-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.py
57 lines (50 loc) · 2.26 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import numpy as np
import re
import copy as cp
"""
Home-made descriptive statistics for one column of a 2-D NumPy dataset.

Functions:
- val_manquante(attribute, dataset): Return the row indices of the missing (non-numeric) values of the given attribute.
- calcul_mediane(attribute, dataset): Compute the median of the given attribute, ignoring missing values.
- tendance_centrales_homeMade(attribute, dataset): Compute the central tendencies [mean, median, modes] of the given attribute.
- quartilles_homeMade(attribute, dataset): Compute the five-number summary [min, Q1, median, Q3, max] of the given attribute.
- ecart_type_home_made(attribute, dataset): Compute the population standard deviation of the given attribute.
"""
def val_manquante(attribute, dataset):
    """Return the row indices whose value in column ``attribute`` is missing.

    A cell counts as present when its ``str()`` form is a plain decimal
    number: optional sign, digits with an optional fractional part, and an
    optional exponent (``12``, ``-3.5``, ``7.``, ``1e-05``).  Anything else
    (``nan``, ``inf``, ``None``, empty or free text) is reported as missing.

    Args:
        attribute: Column index into ``dataset``.
        dataset: 2-D NumPy array indexed as ``dataset[row, attribute]``.

    Returns:
        list[int]: Row indices, in ascending order, of the missing values.
    """
    # The decimal point is optional and sign/exponent are accepted on purpose:
    # integer columns print without a dot, and NumPy prints small or large
    # floats in scientific notation (e.g. ``1e-05``).  Compiled once, outside
    # the scan over the rows.
    numeric = re.compile(r"[+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?")
    column = dataset[:, attribute]
    return [
        row
        for row, value in enumerate(column)
        if not numeric.fullmatch(str(value))
    ]
def calcul_mediane(attribute, dataset):
    """Return the median of column ``attribute``, ignoring missing values.

    Rows reported by ``val_manquante`` are dropped, the remaining values are
    sorted, and the middle element (odd count) or the mean of the two middle
    elements (even count) is returned.

    Args:
        attribute: Column index into ``dataset``.
        dataset: 2-D NumPy array indexed as ``dataset[row, attribute]``.

    Returns:
        The median of the usable values, or ``nan`` when the column has no
        usable value at all.
    """
    valeurs = np.delete(dataset[:, attribute], val_manquante(attribute, dataset))
    if valeurs.size == 0:
        # Nothing left to rank; report "undefined" rather than failing on an
        # out-of-range index.
        return np.nan
    # np.delete returns a new array, so sorting it in place leaves the
    # caller's dataset untouched and no extra copy is needed.
    valeurs.sort()
    milieu = valeurs.size // 2
    if valeurs.size % 2:
        return valeurs[milieu]
    return (valeurs[milieu - 1] + valeurs[milieu]) / 2
def tendance_centrales_homeMade(attribute, dataset):
    """Return the central tendencies of column ``attribute``.

    Missing values (as reported by ``val_manquante``) are ignored.

    Args:
        attribute: Column index into ``dataset``.
        dataset: 2-D NumPy array indexed as ``dataset[row, attribute]``.

    Returns:
        list: ``[mean, median, modes]`` where ``modes`` is the list of every
        value that reaches the highest occurrence count (several values when
        there is a tie).  When the column has no usable value the result is
        ``[nan, nan, []]``.
    """
    valeurs = np.delete(dataset[:, attribute], val_manquante(attribute, dataset))
    if valeurs.size == 0:
        # No data: mean and median are undefined and there is no mode.
        return [np.nan, np.nan, []]

    moyenne = valeurs.sum() / valeurs.size
    mediane = calcul_mediane(attribute, dataset)

    # np.unique returns the distinct values together with their counts; the
    # boolean mask keeps every value tied for the highest count.
    uniques, effectifs = np.unique(valeurs, return_counts=True)
    modes = list(uniques[effectifs == effectifs.max()])
    return [moyenne, mediane, modes]
def quartilles_homeMade(attribute, dataset):
    """Return the five-number summary of column ``attribute``.

    Missing values (as reported by ``val_manquante``) are ignored.  Q1 and Q3
    are each the mean of the two sorted values around positions ``n // 4``
    and ``3 * n // 4``; Q2 is the median from ``calcul_mediane``.

    Args:
        attribute: Column index into ``dataset``.
        dataset: 2-D NumPy array indexed as ``dataset[row, attribute]``.

    Returns:
        list: ``[q0, q1, q2, q3, q4]`` i.e. minimum, first quartile, median,
        third quartile, maximum.  All five are ``nan`` when the column has no
        usable value.
    """
    valeurs = np.delete(dataset[:, attribute], val_manquante(attribute, dataset))
    taille = valeurs.size
    if taille == 0:
        return [np.nan] * 5

    # np.delete returns a new array, so the in-place sort cannot alter the
    # caller's dataset.
    valeurs.sort()

    pos_q1 = taille // 4
    pos_q3 = taille * 3 // 4
    # Clamp the lower neighbour at 0: with fewer than four values
    # ``pos - 1`` would be -1 and silently wrap around to the maximum.
    q1 = (valeurs[max(pos_q1 - 1, 0)] + valeurs[pos_q1]) / 2
    q3 = (valeurs[max(pos_q3 - 1, 0)] + valeurs[pos_q3]) / 2

    q0 = valeurs[0]
    q2 = calcul_mediane(attribute, dataset)
    q4 = valeurs[-1]
    return [q0, q1, q2, q3, q4]
def ecart_type_home_made(attribute, dataset):
    """Return the population standard deviation of column ``attribute``.

    Missing values (as reported by ``val_manquante``) are ignored.  The
    variance is the mean of the squared deviations from the mean (divisor
    ``n``, not ``n - 1``).

    Args:
        attribute: Column index into ``dataset``.
        dataset: 2-D NumPy array indexed as ``dataset[row, attribute]``.

    Returns:
        The square root of the variance, or ``nan`` when the column has no
        usable value.
    """
    valeurs = np.delete(dataset[:, attribute], val_manquante(attribute, dataset))
    if valeurs.size == 0:
        # The mean of an empty set is undefined; return nan explicitly.
        return np.nan
    moyenne = np.mean(valeurs)
    # Vectorised squared deviations instead of a Python-level loop.
    variance = np.mean((valeurs - moyenne) ** 2)
    return np.sqrt(variance)