In [1]:
import pandas as pd
import numpy as np

dadosBrutos = pd.read_csv('communities.data.csv', header=None)
dadosBrutos = dadosBrutos.replace(['?'], [''])

In [2]:
Z = dadosBrutos.iloc[:, 3:4]

In [3]:
dadosBrutos.drop(dadosBrutos.columns[[3]], axis=1, inplace=True)
X = dadosBrutos.iloc[:, :-1]
Y = dadosBrutos.iloc[ :, -1:]

In [4]:
resultado = []

for Y in X:
    Y = X[Y]
    nao_tem = 0
    
    for caso in Y:
        if caso == '':
            nao_tem = nao_tem + 1
    resultado.append(float(nao_tem)*100/float(len(Y)))

print resultado
print len(resultado)


[0.0, 58.876629889669005, 59.027081243731196, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05015045135406219, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 0.0, 0.0, 0.0, 84.00200601805416, 84.00200601805416, 84.00200601805416, 84.00200601805416, 0.0, 84.00200601805416]
126

In [7]:
new_X = []

for x in range(len(X)):
        if resultado[x] > 0:
            new_X.append(X[x])
print new_X


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-7-4f0bb4e85d1f> in <module>()
      2 
      3 for x in range(len(X)):
----> 4         if resultado[x] > 0:
      5             new_X.append(X[x])
      6 print new_X

IndexError: list index out of range

In [6]:
X = new_X
y = Y


# particionar dados em treino e teste
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1) # .5 metade dos dados para teste

print(len(X_train))
print(len(X_test))
print(len(y_train))
print(len(y_test))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-6-baa7122d0fcc> in <module>()
      4 # particionar dados em treino e teste
      5 from sklearn.cross_validation import train_test_split
----> 6 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1) # .5 metade dos dados para teste
      7 
      8 print(len(X_train))

/usr/lib/python2.7/dist-packages/sklearn/cross_validation.pyc in train_test_split(*arrays, **options)
   1904     if test_size is None and train_size is None:
   1905         test_size = 0.25
-> 1906     arrays = indexable(*arrays)
   1907     if stratify is not None:
   1908         cv = StratifiedShuffleSplit(stratify, test_size=test_size,

/usr/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in indexable(*iterables)
    199         else:
    200             result.append(np.array(X))
--> 201     check_consistent_length(*result)
    202     return result
    203 

/usr/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
    174     if len(uniques) > 1:
    175         raise ValueError("Found arrays with inconsistent numbers of samples: "
--> 176                          "%s" % str(uniques))
    177 
    178 

ValueError: Found arrays with inconsistent numbers of samples: [ 101 1994]

In [16]:
import matplotlib.pyplot as plt
a = X[0]
b = X[120]
plt.scatter(a,b)
plt.show()

sample = dadosBrutos.sample(n=200)

x = sample[0]
y = sample[120]

plt.scatter(x,y)
plt.show()



In [ ]:


In [ ]: