In [24]:
import sklearn
import os
import csv
import pandas as pd

In [105]:
os.chdir('/home/mancube/Downloads/data/a10/p1a/')
#os.listdir('/home/mancube/Downloads/data/a10')

In [26]:
header_list = 'T1_xacc,T2_yacc,T3_zacc,T4_xgyro,T5_ygyro,T6_zgyro,T7_xmag,T8_ymag,T9_zmag,RA_xacc,RA_yacc,RA_zacc,RA_xgyro,RA_ygyro,RA_zgyro,RA_xmag,RA_ymag,RA_zmag,LA_xacc,LA_yacc,LA_zacc,LA_xgyro,LA_ygyro,LA_zgyro,LA_xmag,LA_ymag,LA_zmag,RL_xacc,RL_yacc,RL_zacc,RL_xgyro,RL_ygyro,RL_zgyro,RL_xmag,RL_ymag,RL_zmag,LL_xacc,LL_yacc,LL_zacc,LL_xgyro,LL_ygyro,LL_zgyro,LL_xmag,LL_ymag,LL_zmag'
header_list = header_list.split(',')
columns = header_list
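
The list is built here but never handed to the reader below; the file's own header row is what actually names the columns. If a subject file ever lacked that header row, the list could be applied explicitly (a minimal sketch; `header=None` is an assumption about such a file):

In [ ]:
# hypothetical: only needed for a file with no header row of its own
a = pd.read_csv('s01.txt', header=None, names=header_list)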

In [145]:
# DataFrame.from_csv is deprecated; read_csv with an explicit index column is equivalent
a = pd.read_csv('s01.txt', index_col=0)

In [124]:
a['T1_xacc'].describe()


Out[124]:
count    125.000000
mean       9.292758
std        1.877415
min        5.809800
25%        7.945300
50%        9.169000
75%       10.595000
max       13.686000
Name: T1_xacc, dtype: float64
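
The same summary is available for all 45 channels at once at the DataFrame level:

In [ ]:
a.describe()  # count/mean/std/min/quartiles/max for every sensor column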

In [142]:
plt.figure(); a['T3_zacc'].plot();
plt.show()
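
Several torso channels can also be drawn on one axes by selecting a column list first (a sketch using the headers defined above):

In [ ]:
a[['T1_xacc', 'T2_yacc', 'T3_zacc']].plot(figsize=(10, 4))
plt.show()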

In [143]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f']
`%matplotlib` prevents importing * from pylab and numpy


In [8]:
import sklearn
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification

In [9]:
X, y = make_classification(n_samples=1000, n_features=4)

In [11]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [12]:
X_train = X[:-200]
X_test = X[-200:]
y_train = y[:-200]
y_test = y[-200:]

In [13]:
lr.fit(X_train, y_train)
y_train_predictions = lr.predict(X_train)
y_test_predictions = lr.predict(X_test)

In [19]:
(y_train_predictions == y_train).sum().astype(float) / y_train.shape[0]


Out[19]:
0.88

In [20]:
(y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]


Out[20]:
0.90000000000000002
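
The same ratios can be computed with sklearn's metric helper instead of by hand (a sketch; the values should match the manual calculations above):

In [ ]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_predictions))  # ~0.88
print(accuracy_score(y_test, y_test_predictions))    # ~0.90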

In [21]:
X, y = make_classification(n_samples=5000, n_features=4, weights=[.95])

In [22]:
sum(y) / (len(y) * 1.)  # confirm the class imbalance


Out[22]:
0.055800000000000002
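
Only about 5.6% of the labels are positive, so a classifier can look accurate by always predicting the majority class. One common hedge is to reweight the classes during fitting (a minimal sketch; class_weight='balanced' assumes sklearn 0.17+):

In [ ]:
lr_w = LogisticRegression(class_weight='balanced')  # upweight the rare class
lr_w.fit(X[:-500], y[:-500])
print(lr_w.score(X[-500:], y[-500:]))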

In [79]:
from sklearn.datasets import make_blobs
blobs, classes = make_blobs(500, centers=3)
import matplotlib.pyplot as plt

In [85]:
rgb = np.array(['r', 'g', 'b'])  # map each class label to a color (must be defined before use)
f, ax = plt.subplots(figsize=(7.5, 7.5))
ax.scatter(blobs[:, 0], blobs[:, 1], color=rgb[classes])
ax.set_title("Blobs")
plt.show()

In [93]:
from sklearn.cluster import KMeans
kmean = KMeans(n_clusters=3)
kmean.fit(blobs)

# the fitted centers live in kmean.cluster_centers_, not in the KMeans class itself
# (indexing the class, KMeans[:, 0], is what raised the TypeError here)
f, ax = plt.subplots(figsize=(7.5, 7.5))
ax.scatter(blobs[:, 0], blobs[:, 1], color=rgb[classes])
ax.scatter(kmean.cluster_centers_[:, 0], kmean.cluster_centers_[:, 1],
           marker='*', s=250, color='black', label='Centers')
ax.set_title("Blobs")
ax.legend(loc='best')
plt.show()
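
Here k=3 is known to be right because the blobs were generated with three centers; on real data the choice can be sanity-checked with the silhouette score (a sketch):

In [ ]:
from sklearn.metrics import silhouette_score
for k in (2, 3, 4, 5):
    labels = KMeans(n_clusters=k).fit_predict(blobs)
    print('k=%d  silhouette=%.3f' % (k, silhouette_score(blobs, labels)))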

In [104]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]
X_test = [[6], [8], [11], [16]]
y_test = [[8], [12], [15], [18]]

regressor = LinearRegression()
regressor.fit(X_train, y_train)
xx = np.linspace(0, 26, 100)
yy = regressor.predict(xx.reshape(xx.shape[0], 1))
plt.plot(xx, yy)

quadratic_featurizer = PolynomialFeatures(degree=2)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
X_test_quadratic = quadratic_featurizer.transform(X_test)
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), c='r', linestyle='--')
plt.title('Pizza price regressed on diameter')
plt.xlabel('Diameter in inches')
plt.ylabel('Price in dollars')
plt.axis([0, 25, 0, 25])
plt.grid(True)
plt.scatter(X_train, y_train)
plt.show()

print X_train
print X_train_quadratic
print X_test
print X_test_quadratic
print 'Simple linear regression r-squared', regressor.score(X_test, y_test)
print 'Quadratic regression r-squared', regressor_quadratic.score(X_test_quadratic, y_test)


[[6], [8], [10], [14], [18]]
[[   1.    6.   36.]
 [   1.    8.   64.]
 [   1.   10.  100.]
 [   1.   14.  196.]
 [   1.   18.  324.]]
[[6], [8], [11], [16]]
[[   1.    6.   36.]
 [   1.    8.   64.]
 [   1.   11.  121.]
 [   1.   16.  256.]]
Simple linear regression r-squared 0.809726797708
Quadratic regression r-squared 0.867544365635
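
The featurize-then-fit pair can be folded into a single estimator with a Pipeline, which makes sweeping the degree straightforward (a sketch; degree 2 should reproduce the quadratic r-squared above):

In [ ]:
from sklearn.pipeline import make_pipeline
for degree in (1, 2, 3):
    model = make_pipeline(PolynomialFeatures(degree=degree), LinearRegression())
    model.fit(X_train, y_train)
    print('degree %d r-squared: %.4f' % (degree, model.score(X_test, y_test)))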

In [98]:
len([[6], [8], [10], [14],   [18]])


Out[98]:
5

In [100]:
range(5)


Out[100]:
[0, 1, 2, 3, 4]

In [148]:
from sklearn.cross_validation import train_test_split
aT1_xacc = a['T1_xacc']
aT3_zacc = a['T3_zacc']  # the z-acceleration channel is T3_zacc; there is no 'T1_zacc' column

X_train, X_test, y_train, y_test = train_test_split(aT1_xacc, aT3_zacc,
                                                    test_size=0.25, random_state=33)
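
A single 75/25 split is noisy on only 125 rows; k-fold cross-validation gives a steadier estimate (a sketch, assuming a linear fit between the two torso channels is what is being probed):

In [ ]:
from sklearn.cross_validation import cross_val_score
X_cv = a[['T1_xacc']].values  # predictor as a 2-D array
y_cv = a['T3_zacc'].values
scores = cross_val_score(LinearRegression(), X_cv, y_cv, cv=5)  # default scoring is r-squared
print(scores.mean())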

In [149]:
print(a.shape)


(125, 45)

In [165]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
array = a.values
X = np.abs(array[:, 0:8])  # chi2 requires non-negative input; the raw signals are signed
Y = array[:, 8]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5, :])
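
chi2 only accepts non-negative features, which is why abs() was needed above. An ANOVA F-test (f_classif) works on signed data directly; since column 8 is continuous, the sketch below first binarizes it into a hypothetical two-class target:

In [ ]:
from sklearn.feature_selection import f_classif
Y_cls = (array[:, 8] > 0).astype(int)     # hypothetical labels: sign of T9_zmag
test_f = SelectKBest(score_func=f_classif, k=4)
fit_f = test_f.fit(array[:, 0:8], Y_cls)  # raw signed features are fine here
print(fit_f.scores_)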

In [171]:
numpyMatrix = a['T3_zacc'].values  # correct column name; .values supersedes as_matrix()

In [184]:
print a.iloc[0:1]
columns = a['T1_xacc']    # select the Series itself, not a bare list of column names
columns1 = a['T4_xgyro']  # further channels ('T2_yacc', 'T5_ygyro', ...) could be added the same way


        T1_xacc  T2_yacc  T3_zacc  T4_xgyro  T5_ygyro  T6_zgyro  T7_xmag  \
number                                                                     
1        6.4472  0.79436   3.2343   0.33634   0.31322  0.089312 -0.73641   

        T8_ymag  T9_zmag  RA_xacc   ...      RL_zmag  LL_xacc  LL_yacc  \
number                              ...                                  
1      -0.24723 -0.78335   4.7357   ...    -0.026526  -9.3418   5.5313   

        LL_zacc  LL_xgyro  LL_ygyro  LL_zgyro  LL_xmag  LL_ymag  LL_zmag  
number                                                                    
1       -1.0797    1.2502   0.20291   -1.8501  0.35161  0.64004  0.34084  

[1 rows x 45 columns]

In [191]:
import numpy as np
import matplotlib.pyplot as plt

def movingaverage(values, window):
    # simple moving average: convolve with a uniform window
    weights = np.repeat(1.0, window) / window
    sma = np.convolve(values, weights, 'valid')
    return sma

x = columns.values.T.tolist()   # T1_xacc as a plain list (selected in the cell above)
y = columns1.values.T.tolist()  # T4_xgyro

yMA = movingaverage(y, 3)

# 'valid' convolution shortens the output, so trim x to match
plt.plot(x[len(x) - len(yMA):], yMA)
plt.show()
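
pandas can compute the same smoothing directly on a Series, which keeps the index aligned and avoids the manual length trimming (a sketch, assuming pandas 0.18+ for .rolling):

In [ ]:
smoothed = a['T4_xgyro'].rolling(window=3).mean()  # NaN for the first window-1 rows
smoothed.plot()
plt.show()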

In [192]:
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s") % fit.explained_variance_ratio_
print(fit.components_)


Explained Variance: [ 0.88854663  0.06159078  0.02579012]
[[ -2.02176587e-03   9.78115765e-02   1.60930503e-02   6.07566861e-02
    9.93110844e-01   1.40108085e-02   5.37167919e-04  -3.56474430e-03]
 [  2.26488861e-02   9.72210040e-01   1.41909330e-01  -5.78614699e-02
   -9.46266913e-02   4.69729766e-02   8.16804621e-04   1.40168181e-01]
 [ -2.24649003e-02   1.43428710e-01  -9.22467192e-01  -3.07013055e-01
    2.09773019e-02  -1.32444542e-01  -6.39983017e-04  -1.25454310e-01]]
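
The pima features live on very different scales, which is why a single component (dominated by the large-valued 'test' column, coefficient 0.993 above) explains 89% of the variance. Standardizing first usually gives a more balanced decomposition (a sketch):

In [ ]:
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
pca_std = PCA(n_components=3).fit(X_std)
print(pca_std.explained_variance_ratio_)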

In [201]:
import numpy as np
from sklearn.decomposition import PCA
# reuse the sensor DataFrame loaded earlier
array = a.values
X = np.abs(array)  # abs() kept from the chi2 experiment above; PCA itself accepts signed data
Y = array[:, 8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)


Explained Variance: [ 0.34402575  0.18453017  0.12745636]
[[  1.39990458e-01   6.63922122e-02  -2.59903121e-02  -1.83158113e-03
    1.39004582e-02   1.44557942e-02   5.55045323e-04   2.58433353e-03
   -1.40466995e-03  -7.71276816e-02   7.75661988e-02   5.65595119e-02
    2.70002592e-03   1.44683477e-02  -3.15823969e-03  -5.89889291e-03
    2.00530268e-03   5.46313780e-04  -3.85955495e-02   4.40771816e-02
    7.80883569e-02  -4.74715076e-04   7.29447766e-04   4.29550721e-02
   -4.83544737e-03   1.52761539e-03   1.85128348e-05   1.97473718e-01
    3.09712664e-01   1.18300713e-01   2.23659748e-02   1.70249115e-02
   -3.52414385e-02   1.30281818e-02  -1.25171684e-02   3.93741236e-03
    2.81613482e-01   8.29924816e-01   1.52355600e-01   4.13729958e-02
    4.39946171e-02  -4.19237238e-02  -6.95000429e-03   6.23916634e-03
   -7.44458757e-04]
 [ -3.92228259e-01  -1.00984099e-02  -4.66887124e-02  -1.12264496e-02
   -5.56092896e-04   2.81357609e-03  -4.44493947e-03   2.97534514e-03
    2.80167387e-03   1.82028635e-02  -1.27869583e-01   6.04237472e-02
    1.42275160e-02   1.84853235e-03  -8.04895741e-03   4.42401965e-03
   -5.15665988e-03   2.80603677e-03   4.23582144e-02  -1.75736987e-01
   -5.83553465e-02  -3.10171557e-03  -5.74238675e-03   6.35002709e-03
    9.92876056e-03  -6.38869497e-03   1.20835154e-04  -2.81323779e-01
   -4.62039953e-01  -6.54429256e-02  -2.38528155e-02   8.36109015e-03
    3.87376529e-02   1.62609765e-02  -8.48765197e-03  -1.54661411e-02
   -4.82770808e-01   5.01625574e-01   1.23602003e-03   1.84922981e-02
    2.17958773e-02  -3.06041013e-02  -1.57118993e-02   1.96767843e-02
    5.86894796e-03]
 [ -2.03072640e-01   2.76863939e-02  -4.99126848e-02  -4.56147099e-03
    5.61872419e-05  -4.10195254e-03  -3.42967773e-03  -2.86904519e-03
    4.48232785e-03   6.54245302e-02  -1.11121995e-01  -1.99500646e-02
   -2.17483313e-02  -9.76192746e-03   4.49571118e-03   3.83259798e-03
   -3.31205207e-03   8.27663646e-04   4.37765473e-03  -3.58870052e-02
   -8.48713137e-02   1.44606048e-02   6.32514387e-03  -1.91817191e-02
    2.91324578e-03  -7.25459795e-04   2.75230224e-04  -2.78271751e-01
   -4.28952080e-01   2.18580274e-01   8.34242396e-03   2.09878484e-02
    4.02233848e-02  -2.01879339e-02   1.81249308e-02   5.56851233e-03
    7.65745351e-01   1.03627617e-02  -3.00781256e-02   4.78940221e-02
   -4.49563693e-02   1.55225412e-01   1.92857237e-02  -1.94838014e-02
    1.19927202e-03]]

In [202]:
x = np.array([0.0, 1.0, 2.0, 3.0,  4.0,  5.0])
y = np.array([0.0, 0.8, 0.9, 0.1, -0.8, -1.0])
z = np.polyfit(x, y, 3)
z


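np.poly1d turns the fitted coefficients into a callable polynomial, convenient for evaluating and plotting the cubic (a sketch):

In [ ]:
p = np.poly1d(z)  # coefficients are highest degree first
xp = np.linspace(0, 5, 100)
plt.plot(x, y, '.', xp, p(xp), '-')
plt.show()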

In [ ]: