19 Tableaux et analyse de données

19.1 Tableau unidimensionnel de données


In [2]:
from sympy import isprime
# Primes below 100, collected into a plain Python list.
L = list(filter(isprime, range(100)))
L


Out[2]:
[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]

In [3]:
from pandas import Series
s = Series(L)
s


Out[3]:
0      2
1      3
2      5
3      7
4     11
5     13
6     17
7     19
8     23
9     29
10    31
11    37
12    41
13    43
14    47
15    53
16    59
17    61
18    67
19    71
20    73
21    79
22    83
23    89
24    97
dtype: int64

In [4]:
s[13]


Out[4]:
43

In [7]:
s.argmax()


Out[7]:
24

In [8]:
s.cumsum()


Out[8]:
0        2
1        5
2       10
3       17
4       28
5       41
6       58
7       77
8      100
9      129
10     160
11     197
12     238
13     281
14     328
15     381
16     440
17     501
18     568
19     639
20     712
21     791
22     874
23     963
24    1060
dtype: int64

19.2 Afficher quelques statistiques


In [9]:
s.describe()


Out[9]:
count    25.000000
mean     42.400000
std      29.475979
min       2.000000
25%      17.000000
50%      41.000000
75%      67.000000
max      97.000000
dtype: float64

In [10]:
s.mean()


Out[10]:
42.399999999999999

In [11]:
s.std()


Out[11]:
29.475978920696313

In [12]:
s.min()


Out[12]:
2

In [13]:
def f(p):
    """Return the square of ``p`` minus 3."""
    squared = p**2
    return squared - 3
s.apply(f)


Out[13]:
0        1
1        6
2       22
3       46
4      118
5      166
6      286
7      358
8      526
9      838
10     958
11    1366
12    1678
13    1846
14    2206
15    2806
16    3478
17    3718
18    4486
19    5038
20    5326
21    6238
22    6886
23    7918
24    9406
dtype: int64

19.3 Opérations sur une série


In [16]:
s * 10000 + 45


Out[16]:
0      20045
1      30045
2      50045
3      70045
4     110045
5     130045
6     170045
7     190045
8     230045
9     290045
10    310045
11    370045
12    410045
13    430045
14    470045
15    530045
16    590045
17    610045
18    670045
19    710045
20    730045
21    790045
22    830045
23    890045
24    970045
dtype: int64

In [17]:
s / sum(s)


Out[17]:
0     0.001887
1     0.002830
2     0.004717
3     0.006604
4     0.010377
5     0.012264
6     0.016038
7     0.017925
8     0.021698
9     0.027358
10    0.029245
11    0.034906
12    0.038679
13    0.040566
14    0.044340
15    0.050000
16    0.055660
17    0.057547
18    0.063208
19    0.066981
20    0.068868
21    0.074528
22    0.078302
23    0.083962
24    0.091509
dtype: float64

In [18]:
# Cubes of the integers 0 through 24.
t = Series([k * k * k for k in range(25)])

In [19]:
t


Out[19]:
0         0
1         1
2         8
3        27
4        64
5       125
6       216
7       343
8       512
9       729
10     1000
11     1331
12     1728
13     2197
14     2744
15     3375
16     4096
17     4913
18     5832
19     6859
20     8000
21     9261
22    10648
23    12167
24    13824
dtype: int64

In [22]:
# NOTE: both series are int64, so any power outside the 64-bit range overflows;
# the output below shows -9223372036854775808 from index 3 (7**27) onward.
s ** t


Out[22]:
0                       1
1                       3
2                  390625
3    -9223372036854775808
4    -9223372036854775808
5    -9223372036854775808
6    -9223372036854775808
7    -9223372036854775808
8    -9223372036854775808
9    -9223372036854775808
10   -9223372036854775808
11   -9223372036854775808
12   -9223372036854775808
13   -9223372036854775808
14   -9223372036854775808
15   -9223372036854775808
16   -9223372036854775808
17   -9223372036854775808
18   -9223372036854775808
19   -9223372036854775808
20   -9223372036854775808
21   -9223372036854775808
22   -9223372036854775808
23   -9223372036854775808
24   -9223372036854775808
dtype: int64

19.4 Concaténation de deux séries


In [25]:
from pandas import concat
# Join the two series side by side (axis=1): each series becomes one column.
df = concat([s,t], axis=1)
df


Out[25]:
0 1
0 2 0
1 3 1
2 5 8
3 7 27
4 11 64
5 13 125
6 17 216
7 19 343
8 23 512
9 29 729
10 31 1000
11 37 1331
12 41 1728
13 43 2197
14 47 2744
15 53 3375
16 59 4096
17 61 4913
18 67 5832
19 71 6859
20 73 8000
21 79 9261
22 83 10648
23 89 12167
24 97 13824

In [26]:
type(df)


Out[26]:
pandas.core.frame.DataFrame

19.5 Tableau 2-dimensionnel de données


In [34]:
import pandas as pa
# Map each column label to its series; `columns` fixes the left-to-right order.
d = {'nb premiers':s, 'cubes':t}
df = pa.DataFrame(d, columns=['nb premiers', 'cubes'])

In [35]:
df


Out[35]:
nb premiers cubes
0 2 0
1 3 1
2 5 8
3 7 27
4 11 64
5 13 125
6 17 216
7 19 343
8 23 512
9 29 729
10 31 1000
11 37 1331
12 41 1728
13 43 2197
14 47 2744
15 53 3375
16 59 4096
17 61 4913
18 67 5832
19 71 6859
20 73 8000
21 79 9261
22 83 10648
23 89 12167
24 97 13824

In [37]:
df.describe()


Out[37]:
nb premiers cubes
count 25.000000 25.000000
mean 42.400000 3600.000000
std 29.475979 4236.452427
min 2.000000 0.000000
25% 17.000000 216.000000
50% 41.000000 1728.000000
75% 67.000000 5832.000000
max 97.000000 13824.000000

19.6 Accéder à une colonne d'un tableau


In [36]:
df['cubes']


Out[36]:
0         0
1         1
2         8
3        27
4        64
5       125
6       216
7       343
8       512
9       729
10     1000
11     1331
12     1728
13     2197
14     2744
15     3375
16     4096
17     4913
18     5832
19     6859
20     8000
21     9261
22    10648
23    12167
24    13824
Name: cubes, dtype: int64

19.7 Afficher les premières/dernières lignes


In [38]:
L = [isprime(i) for i in range(10000)]

In [39]:
L[:10]


Out[39]:
[False, False, True, True, False, True, False, True, False, False]

In [40]:
# Same boolean list as above, built with map(). list() materialises the result so
# that slicing (L[:10]) and Series(L) also work on Python 3, where map() is lazy.
L = list(map(isprime, range(10000)))

In [43]:
L[:10]


Out[43]:
[False, False, True, True, False, True, False, True, False, False]

In [44]:
s = Series(L)

In [45]:
# Running total of the boolean flags: t[n] counts the True entries of s up to index n.
t = s.cumsum()

In [49]:
df = pa.DataFrame()

In [50]:
df['isprime'] = s

In [51]:
df['pi_x'] = t

In [53]:
df.head()


Out[53]:
isprime pi_x
0 False 0
1 False 0
2 True 1
3 True 2
4 False 2

In [55]:
df.tail(8)


Out[55]:
isprime pi_x
9992 False 1229
9993 False 1229
9994 False 1229
9995 False 1229
9996 False 1229
9997 False 1229
9998 False 1229
9999 False 1229

19.8 Sous-tableau


In [56]:
df[500:520]


Out[56]:
isprime pi_x
500 False 95
501 False 95
502 False 95
503 True 96
504 False 96
505 False 96
506 False 96
507 False 96
508 False 96
509 True 97
510 False 97
511 False 97
512 False 97
513 False 97
514 False 97
515 False 97
516 False 97
517 False 97
518 False 97
519 False 97

19.9 Ajouter une colonne dans un tableau


In [57]:
from math import log

In [58]:
10000 / log(10000)


Out[58]:
1085.7362047581294

In [62]:
from math import sqrt
12.29*sqrt(10000)


Out[62]:
1229.0

In [63]:
def x_sur_log_x(x):
    """Return x / log(x) (natural logarithm) when x > 1, otherwise None."""
    if not x > 1:
        # The ratio is not evaluated for x <= 1 (log(1) == 0, log(0) undefined).
        return None
    return x/log(x)

In [64]:
# X holds the integers 0..9999; both candidate approximations are evaluated on it.
X = Series(range(10000))
# x / log(x); x_sur_log_x returns None when x > 1 does not hold.
gauss = X.apply(x_sur_log_x)
# 12.29 * sqrt(x): the cell above shows this equals 1229.0 at x = 10000.
nous = X.apply(lambda x:12.29*sqrt(x))

In [65]:
df['x_logx'] = gauss
df['nous'] = nous

In [66]:
df.head()


Out[66]:
isprime pi_x x_logx nous
0 False 0 NaN 0.000000
1 False 0 NaN 12.290000
2 True 1 2.885390 17.380685
3 True 2 2.730718 21.286904
4 False 2 2.885390 24.580000

19.10 Visualiser les données


In [67]:
%matplotlib inline

In [68]:
df.plot()


Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x19d80a3d0>

In [70]:
# Remove the 'nous' column from df (in place), then plot only the first 100 rows.
del df['nous']
df[:100].plot()


Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x19d98a450>

In [79]:
from sympy import Li
# Evaluate sympy's Li numerically (.n()) at every integer in range(10000);
# dtype='float64' stores the results as floats (the output below shows -inf at x = 1).
df['Li_x'] = Series([Li(x).n() for x in range(10000)], dtype='float64')

In [80]:
df.head()


Out[80]:
isprime pi_x x_logx Li_x
0 False 0 NaN -1.045164
1 False 0 NaN -inf
2 True 1 2.885390 0.000000
3 True 2 2.730718 1.118425
4 False 2 2.885390 1.922421

In [81]:
df.Li_x


Out[81]:
0         -1.045164
1              -inf
2          0.000000
3          1.118425
4          1.922421
5          2.589425
6          3.177059
7          3.711888
8          4.208555
9          4.676074
10         5.120436
11         5.545845
12         5.955384
13         6.351384
14         6.735662
15         7.109661
16         7.474553
17         7.831301
18         8.180711
19         8.523462
20         8.860136
21         9.191234
22         9.517189
23         9.838383
24        10.155152
25        10.467793
26        10.776570
27        11.081723
28        11.383464
29        11.681988
           ...     
9970    1241.834312
9971    1241.942921
9972    1242.051528
9973    1242.160134
9974    1242.268739
9975    1242.377343
9976    1242.485945
9977    1242.594547
9978    1242.703147
9979    1242.811746
9980    1242.920344
9981    1243.028940
9982    1243.137536
9983    1243.246130
9984    1243.354723
9985    1243.463315
9986    1243.571906
9987    1243.680495
9988    1243.789084
9989    1243.897671
9990    1244.006257
9991    1244.114842
9992    1244.223425
9993    1244.332008
9994    1244.440589
9995    1244.549169
9996    1244.657748
9997    1244.766326
9998    1244.874903
9999    1244.983478
Name: Li_x, dtype: float64

In [84]:
df.plot()


Out[84]:
<matplotlib.axes._subplots.AxesSubplot at 0x19def5250>

19.11 Exporter des données


In [85]:
from pandas import ExcelWriter
# Write the table to an Excel workbook. The context manager saves and closes the
# file on exit, even if to_excel raises (ExcelWriter.save() is deprecated and was
# removed from recent pandas releases). The sheet name is passed by keyword.
with ExcelWriter('tableau.xlsx') as writer:
    df.to_excel(writer, sheet_name='Feuille 1')

In [87]:
df.to_csv('tableau.csv')

In [88]:
ls


16-02-01.ipynb  16-02-16.ipynb  16-03-22.ipynb  16-04-19.ipynb  tableau.csv
16-02-03.ipynb  16-02-23.ipynb  16-04-12.ipynb  16-05-03.ipynb  tableau.xlsx

In [89]:
!head tableau.csv


,isprime,pi_x,x_logx,Li_x
0,False,0,,-1.04516378012
1,False,0,,-inf
2,True,1,2.88539008178,0.0
3,True,2,2.73071767988,1.11842481455
4,False,2,2.88539008178,1.92242131492
5,True,3,3.1066746728,2.58942452992
6,False,3,3.34866375931,3.17705861042
7,True,4,3.59728839659,3.71188798588
8,False,4,3.8471867757,4.20855451944

19.12 Importer des données


In [91]:
# The first spreadsheet column holds the row index written by to_excel; name it
# explicitly so it is restored as the index instead of being read as a data column.
df2 = pa.read_excel('tableau.xlsx', index_col=0)

In [92]:
df2.head()


Out[92]:
isprime pi_x x_logx Li_x
0 False 0 NaN -1.045164
1 False 0 NaN -inf
2 True 1 2.885390 0.000000
3 True 2 2.730718 1.118425
4 False 2 2.885390 1.922421

In [ ]:
# Reference to pandas' CSV reader only; the function is not called in this cell.
pa.read_csv

19.13 Exemple: analyser des données de data.gov.be


In [ ]: