19 Tableaux et analyse de données

19.1 Tableau unidimensionnel de données



In [2]:

    
from sympy import isprime
# Keep every integer of range(100) that sympy's isprime accepts,
# i.e. the primes below 100.
L = [i for i in range(100) if isprime(i)]
# Bare last expression so the notebook displays the list.
L









    Out[2]:





[2,
 3,
 5,
 7,
 11,
 13,
 17,
 19,
 23,
 29,
 31,
 37,
 41,
 43,
 47,
 53,
 59,
 61,
 67,
 71,
 73,
 79,
 83,
 89,
 97]



In [3]:

    
from pandas import Series
# Wrap the list L in a pandas Series. No index is passed, so the entries
# are labelled 0, 1, 2, ... by default.
s = Series(L)
s









    Out[3]:





0      2
1      3
2      5
3      7
4     11
5     13
6     17
7     19
8     23
9     29
10    31
11    37
12    41
13    43
14    47
15    53
16    59
17    61
18    67
19    71
20    73
21    79
22    83
23    89
24    97
dtype: int64



In [4]:

    
# Look up the entry labelled 13 in s.
s[13]









    Out[4]:





43



In [7]:

    
# Locate the maximum of s (the largest value, 97, sits at index 24).
# NOTE(review): depending on the pandas version argmax returns a label or
# a position; both are 24 here with the default index -- confirm on upgrade.
s.argmax()









    Out[7]:





24



In [8]:

    
# Running total: entry k is the sum of the entries of s up to and including k.
s.cumsum()









    Out[8]:





0        2
1        5
2       10
3       17
4       28
5       41
6       58
7       77
8      100
9      129
10     160
11     197
12     238
13     281
14     328
15     381
16     440
17     501
18     568
19     639
20     712
21     791
22     874
23     963
24    1060
dtype: int64

19.2 Afficher quelques statistiques



In [9]:

    
# Summary statistics in one call: count, mean, std, min, quartiles and max.
s.describe()









    Out[9]:





count    25.000000
mean     42.400000
std      29.475979
min       2.000000
25%      17.000000
50%      41.000000
75%      67.000000
max      97.000000
dtype: float64



In [10]:

    
# Arithmetic mean of the series (the "mean" row of describe()).
s.mean()









    Out[10]:





42.399999999999999



In [11]:

    
# Standard deviation of the series (the "std" row of describe()).
s.std()









    Out[11]:





29.475978920696313



In [12]:

    
# Smallest value of the series.
s.min()









    Out[12]:





2



In [13]:

    
def f(p):
    """Return the square of p, minus 3."""
    carre = p ** 2
    return carre - 3
# Call f on every element of s and collect the results in a new Series.
s.apply(f)









    Out[13]:





0        1
1        6
2       22
3       46
4      118
5      166
6      286
7      358
8      526
9      838
10     958
11    1366
12    1678
13    1846
14    2206
15    2806
16    3478
17    3718
18    4486
19    5038
20    5326
21    6238
22    6886
23    7918
24    9406
dtype: int64

19.3 Opérations sur une série



In [16]:

    
# Arithmetic with scalars is applied element by element to the whole series.
s * 10000 + 45









    Out[16]:





0      20045
1      30045
2      50045
3      70045
4     110045
5     130045
6     170045
7     190045
8     230045
9     290045
10    310045
11    370045
12    410045
13    430045
14    470045
15    530045
16    590045
17    610045
18    670045
19    710045
20    730045
21    790045
22    830045
23    890045
24    970045
dtype: int64



In [17]:

    
# Normalise s so that its entries add up to 1.
# Series.sum() replaces the builtin sum(), which would walk through the
# series one element at a time in Python; the total is the same.
s / s.sum()









    Out[17]:





0     0.001887
1     0.002830
2     0.004717
3     0.006604
4     0.010377
5     0.012264
6     0.016038
7     0.017925
8     0.021698
9     0.027358
10    0.029245
11    0.034906
12    0.038679
13    0.040566
14    0.044340
15    0.050000
16    0.055660
17    0.057547
18    0.063208
19    0.066981
20    0.068868
21    0.074528
22    0.078302
23    0.083962
24    0.091509
dtype: float64



In [18]:

    
# Series holding the cubes 0**3, 1**3, ..., 24**3 (one entry per element of s).
t = Series(
    [pow(i, 3) for i in range(25)]
)



In [19]:

    
# Display the series of cubes.
t









    Out[19]:





0         0
1         1
2         8
3        27
4        64
5       125
6       216
7       343
8       512
9       729
10     1000
11     1331
12     1728
13     2197
14     2744
15     3375
16     4096
17     4913
18     5832
19     6859
20     8000
21     9261
22    10648
23    12167
24    13824
dtype: int64



In [22]:

    
# Element-wise power: entry k is s[k] ** t[k].
# NOTE(review): both operands are int64, so powers too large for int64 are
# not exact -- every row from index 3 onward shows -9223372036854775808 in
# the recorded output. Presumably kept on purpose to illustrate overflow; confirm.
s ** t









    Out[22]:





0                       1
1                       3
2                  390625
3    -9223372036854775808
4    -9223372036854775808
5    -9223372036854775808
6    -9223372036854775808
7    -9223372036854775808
8    -9223372036854775808
9    -9223372036854775808
10   -9223372036854775808
11   -9223372036854775808
12   -9223372036854775808
13   -9223372036854775808
14   -9223372036854775808
15   -9223372036854775808
16   -9223372036854775808
17   -9223372036854775808
18   -9223372036854775808
19   -9223372036854775808
20   -9223372036854775808
21   -9223372036854775808
22   -9223372036854775808
23   -9223372036854775808
24   -9223372036854775808
dtype: int64

19.4 Concaténation de deux séries



In [25]:

    
from pandas import concat
# axis=1 puts s and t side by side as two columns instead of stacking them
# end to end; the result is a DataFrame.
df = concat([s,t], axis=1)
df



In [26]:

    
# Check which type concat produced.
type(df)









    Out[26]:





pandas.core.frame.DataFrame

19.5 Tableau bidimensionnel de données



In [34]:

    
import pandas as pa
# Map each column name to the Series that holds its data.
d = {'nb premiers':s, 'cubes':t}
# columns= states the column order explicitly.
df = pa.DataFrame(d, columns=['nb premiers', 'cubes'])



In [35]:

    
# Display the two-column table.
df









    Out[35]:






  
    
      
      nb premiers
      cubes
    
  
  
    
      0
      2
      0
    
    
      1
      3
      1
    
    
      2
      5
      8
    
    
      3
      7
      27
    
    
      4
      11
      64
    
    
      5
      13
      125
    
    
      6
      17
      216
    
    
      7
      19
      343
    
    
      8
      23
      512
    
    
      9
      29
      729
    
    
      10
      31
      1000
    
    
      11
      37
      1331
    
    
      12
      41
      1728
    
    
      13
      43
      2197
    
    
      14
      47
      2744
    
    
      15
      53
      3375
    
    
      16
      59
      4096
    
    
      17
      61
      4913
    
    
      18
      67
      5832
    
    
      19
      71
      6859
    
    
      20
      73
      8000
    
    
      21
      79
      9261
    
    
      22
      83
      10648
    
    
      23
      89
      12167
    
    
      24
      97
      13824



In [37]:

    
# Same summary statistics as Series.describe(), computed for each column.
df.describe()









    Out[37]:






  
    
      
      nb premiers
      cubes
    
  
  
    
      count
      25.000000
      25.000000
    
    
      mean
      42.400000
      3600.000000
    
    
      std
      29.475979
      4236.452427
    
    
      min
      2.000000
      0.000000
    
    
      25%
      17.000000
      216.000000
    
    
      50%
      41.000000
      1728.000000
    
    
      75%
      67.000000
      5832.000000
    
    
      max
      97.000000
      13824.000000

19.6 Accéder à une colonne d'un tableau



In [36]:

    
# Select one column by name; the result is a Series named after the column.
df['cubes']









    Out[36]:





0         0
1         1
2         8
3        27
4        64
5       125
6       216
7       343
8       512
9       729
10     1000
11     1331
12     1728
13     2197
14     2744
15     3375
16     4096
17     4913
18     5832
19     6859
20     8000
21     9261
22    10648
23    12167
24    13824
Name: cubes, dtype: int64

19.7 Afficher les premières/dernières lignes



In [38]:

    
# Rebind L to a list of booleans: L[i] is isprime(i) for 0 <= i < 10000.
L = [isprime(i) for i in range(10000)]



In [39]:

    
# First ten flags (for the integers 0 to 9).
L[:10]









    Out[39]:





[False, False, True, True, False, True, False, True, False, False]



In [40]:

    
# Same flags as the list comprehension, built with map() instead.
# list(...) materialises the result: on Python 3 map() returns a lazy
# iterator, which cannot be sliced (L[:10]) and would be exhausted after a
# single pass. On Python 2 the extra list() call is harmless.
L = list(map(isprime, range(10000)))



In [43]:

    
# First ten flags of the map-based list.
L[:10]









    Out[43]:





[False, False, True, True, False, True, False, True, False, False]



In [44]:

    
# Rebind s to a Series built from the boolean list L.
s = Series(L)



In [45]:

    
# Rebind t to the running total of the flags in s, i.e. t[x] counts the
# primes <= x.
t = s.cumsum()



In [49]:

    
# Start again from an empty DataFrame; columns are attached one at a time.
df = pa.DataFrame()



In [50]:

    
# Assigning to a new column name adds that column to the table.
df['isprime'] = s



In [51]:

    
# Second column: the running prime count held in t.
df['pi_x'] = t



In [53]:

    
# Show only the first rows of the table rather than all of it.
df.head()



In [55]:

    
# Show the last 8 rows of the table.
df.tail(8)

19.8 Sous-tableau



In [56]:

    
# Slice of rows: 500 up to, but not including, 520.
df[500:520]

19.9 Ajouter une colonne dans un tableau



In [57]:

    
from math import log



In [58]:

    
# x / ln(x) evaluated at x = 10000 (math.log is the natural logarithm).
10000 / log(10000)









    Out[58]:





1085.7362047581294



In [62]:

    
from math import sqrt
# 12.29 * sqrt(x) at x = 10000 gives 1229.0, the value that pi_x reaches on
# the last rows of the table.
12.29*sqrt(10000)









    Out[62]:





1229.0



In [63]:

    
def x_sur_log_x(x):
    """Return x / ln(x) when x > 1, and None otherwise.

    The guard excludes x = 1, where ln(x) is zero, and x <= 0, where the
    logarithm is undefined.
    """
    if not x > 1:
        return None
    return x / log(x)



In [64]:

    
# The integers 0..9999, one per row of df.
X = Series(range(10000))
# x / ln(x) for every x; x_sur_log_x returns None for x <= 1, which shows
# up as NaN in the x_logx column.
gauss = X.apply(x_sur_log_x)
# 12.29 * sqrt(x) for every x.
nous = X.apply(lambda x:12.29*sqrt(x))



In [65]:

    
# Attach the two approximations as new columns of the table.
df['x_logx'] = gauss
df['nous'] = nous



In [66]:

    
# First rows of the table, now including the new columns.
df.head()

19.10 Visualiser les données



In [67]:

    
# IPython magic: render matplotlib figures inside the notebook.
%matplotlib inline



In [68]:

    
# Draw the table with the default plot settings; the call returns the
# matplotlib axes object shown in the output.
df.plot()









    Out[68]:





<matplotlib.axes._subplots.AxesSubplot at 0x19d80a3d0>



In [70]:

    
# Remove the 'nous' column from the table before plotting.
# The membership test keeps the cell re-runnable: a bare `del df['nous']`
# raises KeyError the second time the cell is executed.
if 'nous' in df:
    del df['nous']
# Plot only the first 100 rows.
df[:100].plot()









    Out[70]:





<matplotlib.axes._subplots.AxesSubplot at 0x19d98a450>



In [79]:

    
from sympy import Li
# Li(x).n() evaluates sympy's Li numerically for each integer x;
# dtype='float64' stores the results as plain floats in the new column.
df['Li_x'] = Series([Li(x).n() for x in range(10000)], dtype='float64')



In [80]:

    
# First rows of the table, now including Li_x.
df.head()



In [81]:

    
# Attribute-style access to a column, here the column named 'Li_x'.
df.Li_x









    Out[81]:





0         -1.045164
1              -inf
2          0.000000
3          1.118425
4          1.922421
5          2.589425
6          3.177059
7          3.711888
8          4.208555
9          4.676074
10         5.120436
11         5.545845
12         5.955384
13         6.351384
14         6.735662
15         7.109661
16         7.474553
17         7.831301
18         8.180711
19         8.523462
20         8.860136
21         9.191234
22         9.517189
23         9.838383
24        10.155152
25        10.467793
26        10.776570
27        11.081723
28        11.383464
29        11.681988
           ...     
9970    1241.834312
9971    1241.942921
9972    1242.051528
9973    1242.160134
9974    1242.268739
9975    1242.377343
9976    1242.485945
9977    1242.594547
9978    1242.703147
9979    1242.811746
9980    1242.920344
9981    1243.028940
9982    1243.137536
9983    1243.246130
9984    1243.354723
9985    1243.463315
9986    1243.571906
9987    1243.680495
9988    1243.789084
9989    1243.897671
9990    1244.006257
9991    1244.114842
9992    1244.223425
9993    1244.332008
9994    1244.440589
9995    1244.549169
9996    1244.657748
9997    1244.766326
9998    1244.874903
9999    1244.983478
Name: Li_x, dtype: float64



In [84]:

    
# Plot the table again, this time with the Li_x column included.
df.plot()









    Out[84]:





<matplotlib.axes._subplots.AxesSubplot at 0x19def5250>

19.11 Exporter des données



In [85]:

    
from pandas import ExcelWriter
# Write df to the sheet 'Feuille 1' of tableau.xlsx.
# The writer is used as a context manager so the workbook is saved and
# closed when the block exits, even if to_excel raises. This replaces the
# explicit writer.save() call, which recent pandas releases no longer provide.
# The sheet name is passed by keyword rather than by position.
with ExcelWriter('tableau.xlsx') as writer:
    df.to_excel(writer, sheet_name='Feuille 1')



In [87]:

    
# Write the table to a CSV file; the index is written as the first,
# unnamed column.
df.to_csv('tableau.csv')



In [88]:

    
# List the working directory to check that both export files were created.
ls









    



16-02-01.ipynb  16-02-16.ipynb  16-03-22.ipynb  16-04-19.ipynb  tableau.csv
16-02-03.ipynb  16-02-23.ipynb  16-04-12.ipynb  16-05-03.ipynb  tableau.xlsx



In [89]:

    
# Shell command: print the first lines of the exported CSV file.
!head tableau.csv









    



,isprime,pi_x,x_logx,Li_x
0,False,0,,-1.04516378012
1,False,0,,-inf
2,True,1,2.88539008178,0.0
3,True,2,2.73071767988,1.11842481455
4,False,2,2.88539008178,1.92242131492
5,True,3,3.1066746728,2.58942452992
6,False,3,3.34866375931,3.17705861042
7,True,4,3.59728839659,3.71188798588
8,False,4,3.8471867757,4.20855451944

19.12 Importer des données



In [91]:

    
# Read the exported workbook back into a new DataFrame.
# NOTE(review): no sheet name or index column is specified here; check that
# the saved index comes back as intended.
df2 = pa.read_excel('tableau.xlsx')



In [92]:

    
# First rows of the re-imported table.
df2.head()



In [ ]:

    
# NOTE(review): this only references the function and the cell has no
# execution count; presumably meant to become a call that reads
# tableau.csv -- confirm.
pa.read_csv

19.13 Exemple : analyser des données de data.gov.be



In [ ]:

	0	1
0	2	0
1	3	1
2	5	8
3	7	27
4	11	64
5	13	125
6	17	216
7	19	343
8	23	512
9	29	729
10	31	1000
11	37	1331
12	41	1728
13	43	2197
14	47	2744
15	53	3375
16	59	4096
17	61	4913
18	67	5832
19	71	6859
20	73	8000
21	79	9261
22	83	10648
23	89	12167
24	97	13824

	nb premiers	cubes
count	25.000000	25.000000
mean	42.400000	3600.000000
std	29.475979	4236.452427
min	2.000000	0.000000
25%	17.000000	216.000000
50%	41.000000	1728.000000
75%	67.000000	5832.000000
max	97.000000	13824.000000

	isprime	pi_x
9992	False	1229
9993	False	1229
9994	False	1229
9995	False	1229
9996	False	1229
9997	False	1229
9998	False	1229
9999	False	1229

	isprime	pi_x	x_logx	Li_x
0	False	0	NaN	-1.045164
1	False	0	NaN	-inf
2	True	1	2.885390	0.000000
3	True	2	2.730718	1.118425
4	False	2	2.885390	1.922421