3 important Python packages
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
Python module for manipulating tabular data
pandasDataFramenumpyResources
pandas?80% of the effort in data analysis is spent cleaning data. Hadley Wickham
Efficency
Raw data is often in the wrong format
scikit-learn interfaceStorage may be best in a different format
In [2]:
log_filename = 'lustre_debug.out'
log_df = pd.read_csv(log_filename, sep=':', names=['debug mask', 'subsys', 't1', 't2', 'pid1', 'pid2',
'code', 'func', 'line', 'message1', 'message2'])
log_df.head()
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/parsers.py:1170: DtypeWarning: Columns (2) have mixed types. Specify dtype option on import or set low_memory=False.
data = self._reader.read(nrows)
Out[2]:
debug mask
subsys
t1
t2
pid1
pid2
code
func
line
message1
message2
0
10000
80000
2.1F
1.433384e+09
0
0
0
(ldlm_lib.c
2008
target_recovery_expired()) scratch-MDT0000
recovery timed out; 2 clients are still in re...
1
100
80000
22.0
1.433439e+09
0
28364
0
(service.c
789
ptlrpc_update_export_timer()) updating export ...
NaN
2
100
80000
22.0
1.433439e+09
0
28364
0
(service.c
789
ptlrpc_update_export_timer()) updating export ...
NaN
3
100
80000
22.0
1.433439e+09
0
28364
0
(service.c
789
ptlrpc_update_export_timer()) updating export ...
NaN
4
100
80000
22.0
1.433439e+09
0
28364
0
(service.c
789
ptlrpc_update_export_timer()) updating export ...
NaN
In [3]:
log_df.dtypes
Out[3]:
debug mask int64
subsys int64
t1 object
t2 float64
pid1 int64
pid2 int64
code int64
func object
line int64
message1 object
message2 object
dtype: object
In [4]:
mfds = log_df[log_df['message1'].str.contains('mfd')]
mfds.head()
Out[4]:
debug mask
subsys
t1
t2
pid1
pid2
code
func
line
message1
message2
529
4
80000
22.0
1.433439e+09
0
1563
0
(mdt_open.c
646
mdt_mfd_set_mode()) Change mfd ffff88056d872d4...
NaN
543
4
80000
22.0
1.433439e+09
0
13592
0
(mdt_open.c
646
mdt_mfd_set_mode()) Change mfd ffff8805698b324...
NaN
549
4
80000
22.0
1.433439e+09
0
1444
0
(mdt_open.c
646
mdt_mfd_set_mode()) Change mfd ffff880566f0bac...
NaN
551
4
80000
22.0
1.433439e+09
0
1565
0
(mdt_open.c
646
mdt_mfd_set_mode()) Change mfd ffff880570df6ec...
NaN
555
4
80000
22.0
1.433439e+09
0
26845
0
(mdt_open.c
646
mdt_mfd_set_mode()) Change mfd ffff880567609a4...
NaN
In [5]:
'{0:0.2f}%'.format(len(mfds)/float(len(log_df))*100)
Out[5]:
'40.87%'
In [6]:
csv_filename = '2014-04-24.csv'
df = pd.read_csv(csv_filename, sep=';',
names=['host', 'metric', 'value', 'type', 'units', 'time stamp'] )
df.head()
Out[6]:
host
metric
value
type
units
time stamp
0
oss07
lustre.scratch.ost.obdfilter.OST0017.cache_access
0.00000
float
pages/s
1398382546
1
oss07
lustre.scratch.ost.obdfilter.OST0015.disconnect
0.00000
float
requests/s
1398382546
2
oss07
cpu_intr
0.0
float
%
1398382546
3
oss07
lustre.scratch.ost.obdfilter.hosttotal.cache_a...
0.00000
float
pages/s
1398382546
4
oss07
lustre.scratch.ost.obdfilter.OST0025.connect
0.00000
float
requests/s
1398382546
In [7]:
df.dtypes
Out[7]:
host object
metric object
value object
type object
units object
time stamp int64
dtype: object
In [8]:
df['value'] = df['value'].convert_objects(convert_numeric='force')
df.dtypes
Out[8]:
host object
metric object
value float64
type object
units object
time stamp int64
dtype: object
In [9]:
df[np.isnan(df['value'])]
Out[9]:
host
metric
value
type
units
time stamp
11
oss07
gexec
NaN
string
NaN
1398382546
47
oss07
machine_type
NaN
string
NaN
1398382546
55
oss07
os_release
NaN
string
NaN
1398382546
113
oss07
os_name
NaN
string
NaN
1398382546
446
mds01
gexec
NaN
string
NaN
1398382545
489
mds01
machine_type
NaN
string
NaN
1398382545
496
mds01
os_release
NaN
string
NaN
1398382545
571
mds01
os_name
NaN
string
NaN
1398382545
710
oss08
machine_type
NaN
string
NaN
1398382546
955
oss08
gexec
NaN
string
NaN
1398382546
984
oss08
os_release
NaN
string
NaN
1398382546
1285
oss08
os_name
NaN
string
NaN
1398382546
1788
mds02
gexec
NaN
string
NaN
1398382546
1831
mds02
machine_type
NaN
string
NaN
1398382546
1838
mds02
os_release
NaN
string
NaN
1398382546
1913
mds02
os_name
NaN
string
NaN
1398382546
2045
oss01
gexec
NaN
string
NaN
1398382545
2568
oss02
machine_type
NaN
string
NaN
1398382547
2607
oss02
os_name
NaN
string
NaN
1398382547
2790
oss02
gexec
NaN
string
NaN
1398382547
2810
oss02
os_release
NaN
string
NaN
1398382547
3087
oss03
machine_type
NaN
string
NaN
1398382546
3351
oss03
gexec
NaN
string
NaN
1398382546
3379
oss03
os_release
NaN
string
NaN
1398382546
3727
oss03
os_name
NaN
string
NaN
1398382546
4287
oss04
machine_type
NaN
string
NaN
1398382546
4334
oss04
os_name
NaN
string
NaN
1398382546
4524
oss04
gexec
NaN
string
NaN
1398382546
4549
oss04
os_release
NaN
string
NaN
1398382546
4818
oss05
gexec
NaN
string
NaN
1398382546
...
...
...
...
...
...
...
8346899
oss08
os_release
NaN
string
NaN
1398405595
8347200
oss08
os_name
NaN
string
NaN
1398405595
8347704
mds02
gexec
NaN
string
NaN
1398405595
8347747
mds02
machine_type
NaN
string
NaN
1398405595
8347754
mds02
os_release
NaN
string
NaN
1398405595
8347829
mds02
os_name
NaN
string
NaN
1398405595
8347974
oss01
machine_type
NaN
string
NaN
1398405595
8348013
oss01
os_name
NaN
string
NaN
1398405595
8348195
oss01
gexec
NaN
string
NaN
1398405595
8348215
oss01
os_release
NaN
string
NaN
1398405595
8348490
oss02
machine_type
NaN
string
NaN
1398405595
8348529
oss02
os_name
NaN
string
NaN
1398405595
8348712
oss02
gexec
NaN
string
NaN
1398405595
8348732
oss02
os_release
NaN
string
NaN
1398405595
8349009
oss03
machine_type
NaN
string
NaN
1398405595
8349273
oss03
gexec
NaN
string
NaN
1398405595
8349301
oss03
os_release
NaN
string
NaN
1398405595
8349649
oss03
os_name
NaN
string
NaN
1398405595
8350210
oss04
machine_type
NaN
string
NaN
1398405596
8350257
oss04
os_name
NaN
string
NaN
1398405596
8350447
oss04
gexec
NaN
string
NaN
1398405596
8350472
oss04
os_release
NaN
string
NaN
1398405596
8350741
oss05
gexec
NaN
string
NaN
1398405596
8350778
oss05
machine_type
NaN
string
NaN
1398405596
8350784
oss05
os_release
NaN
string
NaN
1398405596
8350854
oss05
os_name
NaN
string
NaN
1398405596
8351188
oss06
machine_type
NaN
string
NaN
1398405597
8351435
oss06
gexec
NaN
string
NaN
1398405597
8351465
oss06
os_release
NaN
string
NaN
1398405597
8351773
oss06
os_name
NaN
string
NaN
1398405597
52911 rows × 6 columns
In [10]:
np.where(np.isnan(df['value']), 0.0, df['value'])
df[np.isnan(df['value'])]
Out[10]:
host
metric
value
type
units
time stamp
11
oss07
gexec
NaN
string
NaN
1398382546
47
oss07
machine_type
NaN
string
NaN
1398382546
55
oss07
os_release
NaN
string
NaN
1398382546
113
oss07
os_name
NaN
string
NaN
1398382546
446
mds01
gexec
NaN
string
NaN
1398382545
489
mds01
machine_type
NaN
string
NaN
1398382545
496
mds01
os_release
NaN
string
NaN
1398382545
571
mds01
os_name
NaN
string
NaN
1398382545
710
oss08
machine_type
NaN
string
NaN
1398382546
955
oss08
gexec
NaN
string
NaN
1398382546
984
oss08
os_release
NaN
string
NaN
1398382546
1285
oss08
os_name
NaN
string
NaN
1398382546
1788
mds02
gexec
NaN
string
NaN
1398382546
1831
mds02
machine_type
NaN
string
NaN
1398382546
1838
mds02
os_release
NaN
string
NaN
1398382546
1913
mds02
os_name
NaN
string
NaN
1398382546
2045
oss01
gexec
NaN
string
NaN
1398382545
2568
oss02
machine_type
NaN
string
NaN
1398382547
2607
oss02
os_name
NaN
string
NaN
1398382547
2790
oss02
gexec
NaN
string
NaN
1398382547
2810
oss02
os_release
NaN
string
NaN
1398382547
3087
oss03
machine_type
NaN
string
NaN
1398382546
3351
oss03
gexec
NaN
string
NaN
1398382546
3379
oss03
os_release
NaN
string
NaN
1398382546
3727
oss03
os_name
NaN
string
NaN
1398382546
4287
oss04
machine_type
NaN
string
NaN
1398382546
4334
oss04
os_name
NaN
string
NaN
1398382546
4524
oss04
gexec
NaN
string
NaN
1398382546
4549
oss04
os_release
NaN
string
NaN
1398382546
4818
oss05
gexec
NaN
string
NaN
1398382546
...
...
...
...
...
...
...
8346899
oss08
os_release
NaN
string
NaN
1398405595
8347200
oss08
os_name
NaN
string
NaN
1398405595
8347704
mds02
gexec
NaN
string
NaN
1398405595
8347747
mds02
machine_type
NaN
string
NaN
1398405595
8347754
mds02
os_release
NaN
string
NaN
1398405595
8347829
mds02
os_name
NaN
string
NaN
1398405595
8347974
oss01
machine_type
NaN
string
NaN
1398405595
8348013
oss01
os_name
NaN
string
NaN
1398405595
8348195
oss01
gexec
NaN
string
NaN
1398405595
8348215
oss01
os_release
NaN
string
NaN
1398405595
8348490
oss02
machine_type
NaN
string
NaN
1398405595
8348529
oss02
os_name
NaN
string
NaN
1398405595
8348712
oss02
gexec
NaN
string
NaN
1398405595
8348732
oss02
os_release
NaN
string
NaN
1398405595
8349009
oss03
machine_type
NaN
string
NaN
1398405595
8349273
oss03
gexec
NaN
string
NaN
1398405595
8349301
oss03
os_release
NaN
string
NaN
1398405595
8349649
oss03
os_name
NaN
string
NaN
1398405595
8350210
oss04
machine_type
NaN
string
NaN
1398405596
8350257
oss04
os_name
NaN
string
NaN
1398405596
8350447
oss04
gexec
NaN
string
NaN
1398405596
8350472
oss04
os_release
NaN
string
NaN
1398405596
8350741
oss05
gexec
NaN
string
NaN
1398405596
8350778
oss05
machine_type
NaN
string
NaN
1398405596
8350784
oss05
os_release
NaN
string
NaN
1398405596
8350854
oss05
os_name
NaN
string
NaN
1398405596
8351188
oss06
machine_type
NaN
string
NaN
1398405597
8351435
oss06
gexec
NaN
string
NaN
1398405597
8351465
oss06
os_release
NaN
string
NaN
1398405597
8351773
oss06
os_name
NaN
string
NaN
1398405597
52911 rows × 6 columns
In [11]:
metrics = pd.pivot_table(df, index=['time stamp', 'host'], columns='metric', values='value')
In [12]:
metrics.head()
Out[12]:
metric
boottime
bytes_in
bytes_out
cpu_aidle
cpu_idle
cpu_intr
cpu_nice
cpu_num
cpu_sintr
cpu_speed
...
udp_indatagrams
udp_inerrors
udp_outdatagrams
udp_rcvbuferrors
vm_pgmajfault
vm_pgpgin
vm_pgpgout
vm_vmeff
voltages_ps1
voltages_ps2
time stamp
host
1398382545
mds01
1397487952
15782.10
28428.83
0
99.8
0
0
24
0
2793
...
1.99647
0
77.59605
0
0
0.1333
327.1510
0
208
208
oss01
NaN
4599.41
38922.52
0
99.8
0
0
24
0
2793
...
2.00075
0
201.57594
0
0
5.7330
4715.4868
0
206
206
1398382546
mds02
1397485845
1187.14
6520.80
0
99.5
0
0
24
0
2792
...
2.13316
0
43.72972
0
0
0.0000
6.5300
0
208
206
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.11213
0
150.75330
0
0
5.8084
5023.1743
0
206
206
oss04
1397485874
4010.40
72188.80
0
99.7
0
0
24
0
2792
...
2.13326
0
323.72208
0
0
4.1308
2131.8872
0
206
206
5 rows × 2004 columns
In [49]:
idx = pd.IndexSlice
metrics.loc[idx[:,'oss03'],:]
Out[49]:
metric
boottime
bytes_in
bytes_out
cpu_aidle
cpu_idle
cpu_intr
cpu_nice
cpu_num
cpu_sintr
cpu_speed
...
udp_indatagrams
udp_inerrors
udp_outdatagrams
udp_rcvbuferrors
vm_pgmajfault
vm_pgpgin
vm_pgpgout
vm_vmeff
voltages_ps1
voltages_ps2
time stamp
host
1398382546
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.11213
0
150.75330
0
0
5.8084
5023.1743
0
206
206
1398382569
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.11213
0
150.75330
0
0
5.8084
5023.1743
0
206
206
1398382582
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.11213
0
150.75330
0
0
5.8084
5023.1743
0
206
206
1398382598
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.11213
0
150.75330
0
0
5.8084
5023.1743
0
206
206
1398382612
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.09613
0
221.33176
0
0
5.8084
5023.1743
0
206
206
1398382627
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.09613
0
221.33176
0
0
5.8084
5023.1743
0
206
206
1398382640
oss03
1397492325
4464.57
41270.28
0
99.8
0
0
24
0
2793
...
2.09613
0
221.33176
0
0
5.8084
5023.1743
0
206
206
1398382655
oss03
1397492325
4743.13
41552.79
0
99.8
0
0
24
0
2793
...
2.08457
0
155.78372
0
0
1676.0653
5157.7031
0
206
206
1398382669
oss03
1397492325
4743.13
41552.79
0
99.8
0
0
24
0
2793
...
2.08457
0
155.78372
0
0
1676.0653
5157.7031
0
206
206
1398382685
oss03
1397492325
4743.13
41552.79
0
99.8
0
0
24
0
2793
...
2.13159
0
154.93977
0
0
5.9951
2894.4302
0
206
206
1398382699
oss03
1397492325
4743.13
41552.79
0
99.8
0
0
24
0
2793
...
2.13159
0
154.93977
0
0
5.9951
2894.4302
0
206
206
1398382714
oss03
1397492325
251522.98
38802.44
0
99.8
0
0
24
0
2793
...
2.11212
0
150.75256
0
0
9.1085
4489.8394
0
206
206
1398382729
oss03
1397492325
251522.98
38802.44
0
99.8
0
0
24
0
2793
...
2.11212
0
150.75256
0
0
9.1085
4489.8394
0
206
206
1398382743
oss03
1397492325
251522.98
38802.44
0
99.8
0
0
24
0
2793
...
2.13147
0
155.73021
0
0
13.9877
4108.7983
0
206
206
1398382757
oss03
1397492325
251522.98
38802.44
0
99.8
0
0
24
0
2793
...
2.13147
0
155.73021
0
0
13.9877
4108.7983
0
206
206
1398382772
oss03
1397492325
4773.11
36129.64
0
99.8
0
0
24
0
2793
...
2.07911
0
150.78505
0
0
9.3725
1857.0757
0
206
206
1398382790
oss03
1397492325
4773.11
36129.64
0
99.8
0
0
24
0
2793
...
2.07911
0
150.78505
0
0
9.3725
1857.0757
0
206
206
1398382806
oss03
1397492325
4773.11
36129.64
0
99.7
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382822
oss03
1397492325
4773.11
36129.64
0
99.7
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382838
oss03
1397492325
4773.11
36129.64
0
99.7
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382856
oss03
1397492325
4773.11
36129.64
0
99.7
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382873
oss03
1397492325
4773.11
36129.64
0
99.7
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382889
oss03
1397492325
4773.11
36129.64
0
99.7
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382907
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382920
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382934
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382950
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382965
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382979
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
1398382993
oss03
1397492325
6690.12
36400.23
0
99.8
0
0
24
0
2793
...
2.09736
0
231.70877
0
0
7.5930
6387.1782
0
206
206
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
1398405084
oss03
1397492325
5229.93
35836.18
0
99.8
0
0
24
0
2793
...
2.06456
0
155.24191
0
0
3.4631
1269.7725
0
206
206
1398405103
oss03
1397492325
5229.93
35836.18
0
99.8
0
0
24
0
2793
...
2.06456
0
155.24191
0
0
3.4631
1269.7725
0
206
206
1398405119
oss03
1397492325
5229.93
35836.18
0
99.7
0
0
24
0
2793
...
2.06456
0
155.24191
0
0
3.4631
1269.7725
0
206
206
1398405137
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405155
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405173
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405191
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405207
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405225
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405244
oss03
1397492325
5229.93
35836.18
0
99.4
0
0
24
0
2793
...
2.09215
0
165.56871
0
0
22.3990
12691.8154
0
206
206
1398405262
oss03
1397492325
5229.93
35836.18
0
99.8
0
0
24
0
2793
...
2.34304
0
151.37337
0
0
7.3921
3890.7598
0
206
206
1398405278
oss03
1397492325
5229.93
35836.18
0
99.8
0
0
24
0
2793
...
2.34304
0
151.37337
0
0
7.3921
3890.7598
0
206
206
1398405297
oss03
1397492325
5229.93
35836.18
0
99.8
0
0
24
0
2793
...
2.34304
0
151.37337
0
0
7.3921
3890.7598
0
206
206
1398405315
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.08722
0
168.03920
0
0
7.3921
3890.7598
0
206
206
1398405333
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.08722
0
168.03920
0
0
7.3921
3890.7598
0
206
206
1398405350
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.07635
0
144.42867
0
0
3.4611
3855.8459
0
206
206
1398405368
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.07635
0
144.42867
0
0
3.4611
3855.8459
0
206
206
1398405384
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.07635
0
144.42867
0
0
3.4611
3855.8459
0
206
206
1398405402
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.07635
0
144.42867
0
0
3.4611
3855.8459
0
206
206
1398405418
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.07635
0
144.42867
0
0
3.4611
3855.8459
0
206
206
1398405437
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.07635
0
144.42867
0
0
3.4611
3855.8459
0
206
206
1398405455
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.11212
0
151.41281
0
0
3.3002
2856.6509
0
206
206
1398405471
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.13135
0
153.72330
0
0
8.6586
3943.6567
0
206
206
1398405491
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.13135
0
153.72330
0
0
8.6586
3943.6567
0
206
206
1398405508
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.13135
0
153.72330
0
0
8.6586
3943.6567
0
206
206
1398405526
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.13135
0
153.72330
0
0
8.6586
3943.6567
0
206
206
1398405542
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.13135
0
153.72330
0
0
8.6586
3943.6567
0
206
206
1398405561
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.11220
0
150.75810
0
0
4.0924
3790.6047
0
206
206
1398405577
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.11220
0
150.75810
0
0
4.0924
3790.6047
0
206
206
1398405595
oss03
1397492325
7175.45
1123138.88
0
99.8
0
0
24
0
2793
...
2.11220
0
150.75810
0
0
4.0924
3790.6047
0
206
206
1323 rows × 2004 columns
In [117]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA
In [118]:
A = metrics.as_matrix()
A.shape
Out[118]:
(13230, 2004)
In [119]:
A[np.where(np.isnan(A))] = 0.0
In [120]:
B = scale(A)
pca = PCA().fit(B)
In [107]:
B_reduced = pca.transform(B)
In [121]:
pca.explained_variance_ratio_[0:20]
Out[121]:
array([ 0.06783672, 0.05587189, 0.03864337, 0.03766685, 0.03579751,
0.03239015, 0.0317283 , 0.03165693, 0.01985705, 0.01842889,
0.01471589, 0.01238719, 0.0119161 , 0.00981694, 0.00932759,
0.00892568, 0.00790127, 0.0077229 , 0.00754279, 0.0074405 ])
In [ ]:
Content source: ResearchComputing/xsede_2015
Similar notebooks: