In [13]:
import pandas as pd
from scipy import *
%pylab inline
from scipy import stats as st


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['show_config', 'log2', 'arccos', 'arctanh', 'fft', 'arcsin', 'sqrt', '__version__', 'power', 'log', 'log10', 'test']
`%matplotlib` prevents importing * from pylab and numpy

In [2]:
# Read the whole ARFF file into a list of lines. The context manager closes
# the file handle as soon as the lines are in memory instead of leaking it.
with open('classifier.arff') as arff_file:
    arf = arff_file.readlines()

In [3]:
# Hard-coded offsets into the file: list indices 2..47 are taken as the
# attribute declaration lines and everything from index 50 on as data rows.
# NOTE(review): presumably matches the layout of this specific
# classifier.arff -- confirm before reusing with another file.
labels = arf[2:48]
data = arf[50:]

In [4]:
def parseArffLabelRow(row):
    """Extract the attribute name and type from one ARFF header line.

    The row is split on whitespace; the second token is returned under
    ``'feature'`` and the third under ``'type'``. The first token and any
    tokens after the third are ignored.

    Raises:
        IndexError: if the row has fewer than three whitespace-separated
            tokens.
    """
    fields = row.split()
    name, kind = fields[1], fields[2]
    return dict(feature=name, type=kind)

def parseArffDataRow(row):
    """Split one ARFF data line into its comma-separated string fields.

    Only trailing line-terminator characters (``\\n`` / ``\\r``) are removed
    before splitting. Slicing off the last character unconditionally would
    truncate the final field of a line that has no trailing newline (for
    example the last line of the file) and would leave a stray ``\\r`` on
    CRLF-terminated files.

    Returns:
        list of str: the raw field values, not converted or stripped.
    """
    return row.rstrip('\r\n').split(',')

In [5]:
# One {'feature': ..., 'type': ...} dict per attribute declaration line.
featureLabel = list(map(parseArffLabelRow, labels))

In [6]:
# Tokenise every data line, then collect the attribute names in the order
# they were declared so they can serve as column names.
featureVector = list(map(parseArffDataRow, data))
cols = [label['feature'] for label in featureLabel]

In [7]:
# Build the frame from the parsed rows (lists of strings), naming the columns
# after the ARFF attributes.
df = pd.DataFrame.from_records(featureVector, columns=cols)

In [8]:
# Convert every column except the last from its parsed string form to float.
# The last column is left as-is -- presumably the 'class' label; confirm
# against the attribute list.
df[cols[:-1]] = df[cols[:-1]].astype(float)

In [9]:
# Summary statistics from describe(), transposed so each feature is a row.
stats = df.describe().T

In [10]:
# Split the rows into two frames according to the value of the 'class' column.
simple = df[df['class'] == 'simple']
complejo = df[df['class'] == 'complejo']

Data Summary

We need to check the first five features: they don't seem to appear in most of the texts. Why?


In [11]:
# Order the summary table by standard deviation, ascending.
# sort_values(by=...) replaces the deprecated sort_index(by=...) spelling,
# which current pandas no longer accepts.
stats.sort_values(by='std')


Out[11]:
count mean std min 25% 50% 75% max
CRFANP1a 100 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
CRFANP1 100 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
DRNP 100 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
WRDPRP1p 100 0.000860 0.001826 0.000000 0.000000 0.000000 0.001000 0.008000
CNCCaus 100 0.002000 0.001975 0.000000 0.000000 0.001000 0.003000 0.008000
WRDPRP3p 100 0.001670 0.002184 0.000000 0.000000 0.001000 0.002250 0.010000
CNCADC 100 0.004490 0.003886 0.000000 0.001750 0.004000 0.006000 0.017000
CNCTemp 100 0.004830 0.003900 0.000000 0.002000 0.004000 0.006250 0.017000
WRDPRP2 100 0.002220 0.004049 0.000000 0.000000 0.000000 0.002000 0.022000
WRDPRP1s 100 0.004190 0.007019 0.000000 0.000000 0.001000 0.005000 0.038000
WRDPRP3s 100 0.008000 0.008024 0.000000 0.003000 0.006000 0.010000 0.057000
CNCLogic 100 0.017550 0.011535 0.002000 0.010000 0.013000 0.024500 0.067000
CNCAdd 100 0.018500 0.012542 0.002000 0.010000 0.014000 0.024750 0.070000
WRDPRO 100 0.016940 0.014661 0.001000 0.007000 0.012000 0.023000 0.079000
WRDADJ 100 0.023650 0.015482 0.004000 0.014000 0.017500 0.029750 0.067000
CRFCWOad 100 0.100270 0.018378 0.062343 0.087153 0.102402 0.111531 0.143418
CNCAll 100 0.029740 0.019100 0.004000 0.018000 0.022000 0.039500 0.100000
CRFCWO1d 100 0.059434 0.020989 0.025513 0.048184 0.055663 0.063324 0.158100
WRDADV 100 0.030760 0.021003 0.006000 0.017000 0.024500 0.037000 0.108000
CRFCWO1 100 0.102252 0.024454 0.042829 0.086202 0.100239 0.116207 0.170476
CRFCWOa 100 0.111251 0.031365 0.061651 0.087724 0.103396 0.125696 0.204238
LDTTRa 100 0.473838 0.064986 0.278166 0.433247 0.475819 0.523096 0.646617
WRDVERB 100 0.105770 0.065870 0.027000 0.064000 0.081500 0.129500 0.345000
LDTTRc 100 0.541045 0.066501 0.353709 0.492676 0.544500 0.582268 0.723810
WRDNOUN 100 0.102480 0.067940 0.022000 0.060000 0.074000 0.122500 0.352000
DESWLsy 100 1.997214 0.088195 1.756098 1.947213 1.993584 2.050855 2.199405
SYNNP 100 0.690966 0.123207 0.385965 0.599510 0.691015 0.741420 1.125000
CRFNOa 100 0.233931 0.183333 0.020588 0.090844 0.184025 0.298400 0.867769
DESWLsyd 100 1.162414 0.187860 0.924795 1.060587 1.139837 1.210586 2.416140
CRFSOa 100 0.267925 0.199091 0.026701 0.116055 0.212417 0.359768 0.867769
CRFNO1 100 0.235006 0.201654 0.000000 0.084100 0.186047 0.308894 0.857143
DESWLlt 100 4.441184 0.206859 3.777778 4.329070 4.402947 4.552217 5.004376
CRFSO1 100 0.275649 0.220233 0.000000 0.115440 0.211309 0.375000 0.857143
CRFAOa 100 0.436983 0.226239 0.059710 0.258024 0.403125 0.576633 1.000000
DESWLltd 100 2.696159 0.228805 2.288348 2.513647 2.657821 2.823964 3.529618
CRFAO1 100 0.454773 0.235971 0.000000 0.283435 0.401220 0.666667 1.000000
DESSLd 100 10.088298 2.975030 5.056948 7.937405 9.703606 11.493765 22.244983
RDFFGL 100 81.428393 5.630399 68.191248 77.884241 81.534778 85.349842 91.931527
DRNEG 100 7.330000 6.758541 0.000000 3.000000 5.000000 9.000000 35.000000
DESSL 100 17.622631 6.908486 7.966667 12.919341 16.756538 19.367384 46.375000
DESPC 100 10.450000 11.355037 1.000000 3.000000 6.500000 13.250000 74.000000
DRVP 100 30.390000 24.236584 7.000000 15.000000 21.500000 37.500000 134.000000
DESSC 100 33.390000 28.223815 7.000000 15.750000 22.500000 40.250000 154.000000
DESPL 100 194.306334 263.829803 0.000000 90.802377 148.327189 223.411352 2595.081887
DESWC 100 498.780000 316.936299 111.000000 301.000000 370.000000 661.250000 1585.000000

In [14]:
# 'DESWC' against 'DESPL', one marker series per class on shared axes.
plot(simple['DESWC'], simple['DESPL'], 'o', label='simple')
# Both series must use the same pair of columns; indexing a non-existent
# 'ç' column here raised a KeyError.
plot(complejo['DESWC'], complejo['DESPL'], 'o', label='complejo')
xlabel('DESWC')
ylabel('DESPL')
legend(loc='upper left')


Out[14]:
<matplotlib.legend.Legend at 0x108971c90>

In [15]:
import matplotlib.pyplot as plt

In [25]:
# scatter_matrix lives in pandas.plotting in current pandas; fall back to the
# legacy pandas.tools.plotting location on old installations.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

# Line plot of the two columns against the row index.
df[['DESWC','DESPL']].plot()


Out[25]:
<matplotlib.axes.AxesSubplot at 0x1c351a4d0>

In [37]:
# Pairwise scatter-plot grid over the frame; the returned array of axes is
# kept in `a` so individual rows can be inspected afterwards.
a = scatter_matrix(df, figsize=(20, 20))



In [38]:
# Second row (index 1) of the axes array returned by scatter_matrix.
a[1]


Out[38]:
array([<matplotlib.axes.AxesSubplot object at 0x14f9da310>,
       <matplotlib.axes.AxesSubplot object at 0x204ad5290>,
       <matplotlib.axes.AxesSubplot object at 0x206ca2cd0>,
       <matplotlib.axes.AxesSubplot object at 0x13b80ef90>,
       <matplotlib.axes.AxesSubplot object at 0x123648150>,
       <matplotlib.axes.AxesSubplot object at 0x12b129990>,
       <matplotlib.axes.AxesSubplot object at 0x12e66c750>,
       <matplotlib.axes.AxesSubplot object at 0x136abf7d0>,
       <matplotlib.axes.AxesSubplot object at 0x136c31750>,
       <matplotlib.axes.AxesSubplot object at 0x1251101d0>,
       <matplotlib.axes.AxesSubplot object at 0x1395b9490>,
       <matplotlib.axes.AxesSubplot object at 0x139f0d610>,
       <matplotlib.axes.AxesSubplot object at 0x136caa0d0>,
       <matplotlib.axes.AxesSubplot object at 0x13951fc10>,
       <matplotlib.axes.AxesSubplot object at 0x14c689c90>,
       <matplotlib.axes.AxesSubplot object at 0x27766dc10>,
       <matplotlib.axes.AxesSubplot object at 0x2031ad690>,
       <matplotlib.axes.AxesSubplot object at 0x20149e950>,
       <matplotlib.axes.AxesSubplot object at 0x203df7ad0>,
       <matplotlib.axes.AxesSubplot object at 0x218067350>,
       <matplotlib.axes.AxesSubplot object at 0x12ce39350>,
       <matplotlib.axes.AxesSubplot object at 0x12c7c7190>,
       <matplotlib.axes.AxesSubplot object at 0x139707110>,
       <matplotlib.axes.AxesSubplot object at 0x134135b50>,
       <matplotlib.axes.AxesSubplot object at 0x122ce4e10>,
       <matplotlib.axes.AxesSubplot object at 0x12454a050>,
       <matplotlib.axes.AxesSubplot object at 0x1293df810>,
       <matplotlib.axes.AxesSubplot object at 0x12e32a5d0>,
       <matplotlib.axes.AxesSubplot object at 0x136fe4650>,
       <matplotlib.axes.AxesSubplot object at 0x139a655d0>,
       <matplotlib.axes.AxesSubplot object at 0x12529f110>,
       <matplotlib.axes.AxesSubplot object at 0x138547310>,
       <matplotlib.axes.AxesSubplot object at 0x138a88490>,
       <matplotlib.axes.AxesSubplot object at 0x12c753cd0>,
       <matplotlib.axes.AxesSubplot object at 0x124c53a90>,
       <matplotlib.axes.AxesSubplot object at 0x1517a1b10>,
       <matplotlib.axes.AxesSubplot object at 0x127bb8a90>,
       <matplotlib.axes.AxesSubplot object at 0x213c7e510>,
       <matplotlib.axes.AxesSubplot object at 0x2033fb7d0>,
       <matplotlib.axes.AxesSubplot object at 0x204c75950>,
       <matplotlib.axes.AxesSubplot object at 0x21152e1d0>,
       <matplotlib.axes.AxesSubplot object at 0x20cddef50>,
       <matplotlib.axes.AxesSubplot object at 0x20268dfd0>,
       <matplotlib.axes.AxesSubplot object at 0x204e0df50>,
       <matplotlib.axes.AxesSubplot object at 0x207ae49d0>], dtype=object)

In [50]:
# Pearson correlation coefficient (r3) and its p-value (p3) between 'DESWC'
# and 'DESPL', restricted to the rows of the `simple` frame.
r3, p3 = st.pearsonr(simple['DESWC'], simple['DESPL'])

In [61]:
# All column names except the last one.
# NOTE: this rebinds `labels`, which earlier in the notebook held the raw
# ARFF attribute lines.
labels = df.columns[:-1]

In [63]:
def get_pearson_tuple(x, y, df):
    """Return ``(x, y, r)`` for the Pearson correlation of two columns.

    Args:
        x: name of the first column in ``df``.
        y: name of the second column in ``df``.
        df: frame holding both columns.

    Returns:
        tuple: the two column names followed by the correlation coefficient
        reported by ``st.pearsonr``; the accompanying p-value is discarded.
    """
    coefficient, _p_value = st.pearsonr(df[x], df[y])
    return x, y, coefficient

In [74]:
# Correlate every ordered pair of distinct columns in `labels`; each unordered
# pair therefore appears twice, once as (x, y) and once as (y, x).
pearson = [get_pearson_tuple(x, y, df) for x in labels for y in labels if x != y]

In [79]:
# Strongest correlations first (largest |r|). NaN never compares equal to
# anything, so `r != nan` is always True and cannot be used as a guard; NaN
# has to be detected with isnan(). NaN entries get key 0, the largest
# possible key, and therefore sort to the end of the list.
pearson.sort(key=lambda x: 0 if np.isnan(x[2]) else -abs(x[2]))

In [82]:
# 'CRFSO1' against 'CRFNO1'. A single call is sufficient; repeating the
# identical call only draws the same points a second time.
scatter(df['CRFSO1'], df['CRFNO1'])


Out[82]:
<matplotlib.collections.PathCollection at 0x10e79d350>

In [ ]: