In [1]:
# Display a local image file as this cell's output.
from IPython.display import Image
Image(filename='images/phd053104s.png')
Out[1]:
In [2]:
# Embed the pandas project website in a 900x350 iframe.
from IPython.display import IFrame
IFrame('http://pandas.pydata.org/', width=900, height=350)
Out[2]:
In [7]:
import pandas as pd
import numpy as np
In [8]:
# Page through the raw CSV before parsing it.
# NOTE(review): this bare command is not Python; it presumably relies on IPython
# automagic being enabled — confirm, or write it explicitly with a leading `%`.
more data/train.csv
In [9]:
df = pd.read_csv('data/train.csv')
# Replace our 'y' values (the 'target' column) with integer codes.
# np.unique already returns its result sorted, so codes follow sorted label order.
types = np.unique(df['target'])
new_values = {label: code for code, label in enumerate(types)}
df['target'] = df['target'].map(new_values).astype(np.int32)
**También mirar: read_excel, read_clipboard, read_fwf, read_html, read_json, read_sql.**
In [10]:
# Preview the first five rows of the frame.
df.head(5)
Out[10]:
In [11]:
# Summary statistics of the frame's columns.
df.describe()
Out[11]:
In [12]:
# Pairwise correlation between the frame's columns.
df.corr()
Out[12]:
In [13]:
# Correlation between two individual columns.
df['feat_11'].corr(df['feat_90'])
Out[13]:
In [14]:
# Select a subset of columns by passing a list of column names.
df[['feat_1', 'target']]
Out[14]:
In [15]:
# Label-based selection with .loc: row labels 2 through 5 and two named columns.
df.loc[2:5, ['id', 'target']]
Out[15]:
In [16]:
df.isnull().any() # one flag per column: does it contain any null? (df.isnull() alone gives the full boolean matrix)
Out[16]:
In [17]:
# Drop duplicated rows, then compare both shapes to see how many rows were removed.
unique_df = df.drop_duplicates()
unique_df.shape, df.shape
Out[17]:
In [14]:
# Two ways of filling missing values; each result is bound to a new name, df itself is not reassigned.
# 1) Linear interpolation along axis 0.
df_with_interpolated_values = df.interpolate(method='linear', axis=0)
# Different interpolation methods can be used, along either of the two axes.
# 2) Fill the gaps with the values computed by df.mean().
df_with_fill_values = df.fillna(df.mean())
**También mirar:**
- **fillna** tiene varias opciones, como bfill y ffill.
- **dropna** elimina de la tabla las filas (o columnas) con valores incompletos.
- **interpolate** cuenta con varios métodos: spline, pchip, polynomial, etc.
In [15]:
# Show the column summary of the filled frame and of the original, one after the other.
df_with_fill_values.info(), df.info()
Out[15]:
**También mirar: join, concat, merge, combine, etc., para unificar datos de distintos orígenes con diferentes patrones.**
In [20]:
# Feature matrix: every column except the first and the last one, as float32.
feature_columns = df.columns[1:-1]
xs = df[feature_columns].values.astype(np.float32)
# Label vector: the 'target' column as int32.
y = df['target'].values.astype(np.int32)
In [18]:
# Export the frame as tab-separated text, without a header row and without the index column.
df.to_csv('/tmp/this_is_my_out.txt', sep='\t', header=None, index=False)
In [18]:
# Copy the rows with labels up to 5 (all columns) to the system clipboard.
df.loc[:5, :].to_clipboard()
**También mirar: to_excel, to_json, to_html, to_pickle, ... y sus parámetros.**
In [3]:
# Embed the scikit-learn website in a 900x350 iframe.
from IPython.display import IFrame
IFrame('http://scikit-learn.org/stable/', width=900, height=350)
Out[3]:
In [20]:
# Display a remote image, referenced by URL, at 1000x1000.
from IPython.display import Image
Image(url='http://1.bp.blogspot.com/-ME24ePzpzIM/UQLWTwurfXI/AAAAAAAAANw/W3EETIroA80/s1600/drop_shadows_background.png',
width=1000, height=1000)
Out[20]:
In [21]:
# Reduce the feature matrix to two components so it can be drawn on a plane.
from sklearn.decomposition import RandomizedPCA  # PCA via randomized Singular Value Decomposition
projector = RandomizedPCA(n_components=2, random_state=1)
Xp = projector.fit_transform(xs)
Xp
Out[21]:
In [22]:
% matplotlib inline
import matplotlib.pyplot as plt
# get the product class
product_class = np.unique(y)
colors = plt.get_cmap("hsv")
plt.figure(figsize=(10, 4))
for i, p in enumerate(product_class):
mask = (y == p)
plt.scatter(Xp[mask, 0], Xp[mask, 1],
c=colors(1. * i / 11), label=p, alpha=0.2)
plt.legend(loc="best")
Out[22]:
In [23]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn import preprocessing
#X = preprocessing.scale(xs, axis=1)
# Hold out part of the data for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(xs, y, random_state=1)
# Report the shape of every piece of the split.
shape_report = """X_train shape : {}, y_train shape : {}
X_test shape : {}, y_test shape : {}"""
print(shape_report.format(X_train.shape, y_train.shape, X_test.shape, y_test.shape))
In [24]:
def plot_matrix(clf, X_test, y_test):
    """Draw the confusion matrix of a fitted classifier as a grayscale image.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_test : samples to predict on.
    y_test : true labels for ``X_test``.

    The current figure is cleared first and the result is shown immediately.
    """
    plt.clf()
    predicted = clf.predict(X_test)
    # Predictions are passed first on purpose: the matrix rows (the image's
    # vertical axis) then index predicted labels and the columns index true
    # labels, which is what the axis labels below announce.
    matrix = confusion_matrix(predicted, y_test)
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.binary)
    plt.colorbar()
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    plt.show()
In [17]:
from sklearn.svm import SVC
# Fit an RBF-kernel support vector classifier and evaluate it on the held-out split.
sv = SVC(kernel='rbf', cache_size=1000)
sv.fit(X_train, y_train)
# classification_report takes (y_true, y_pred); passing the predictions first
# swaps precision and recall in the printed table.
print(classification_report(y_test, sv.predict(X_test)))
print(sv.score(X_test, y_test))
plot_matrix(sv, X_test, y_test)
In [25]:
from sklearn.ensemble import ExtraTreesClassifier
# Fit an extremely-randomized-trees ensemble and evaluate it on the held-out split.
clf = ExtraTreesClassifier(n_estimators=200,
                           max_features=0.2,
                           n_jobs=2,
                           max_depth=None,
                           # A node needs at least two samples to be split, so 2
                           # grows the same trees as the former value 1 while
                           # being accepted by scikit-learn releases that reject 1.
                           min_samples_split=2,
                           random_state=1).fit(X_train, y_train)
# classification_report takes (y_true, y_pred); passing the predictions first
# swaps precision and recall in the printed table.
print(classification_report(y_test, clf.predict(X_test)))
print(clf.score(X_test, y_test))
plot_matrix(clf, X_test, y_test)
In [26]:
# Bar chart of the per-feature importances of the fitted ensemble, then its top 10.
importances = clf.feature_importances_
# Take the count from the data instead of hard-coding 93, and materialize the
# names as a list (a lazy `map` object cannot be used as tick labels).
n_features = len(importances)
text = list(df.columns[1:-1])
plt.figure(figsize=(20, 6))
print(importances.shape)
# align='edge' pins bar i to [i, i + 1), so the ticks at i + 0.5 sit on the bar
# centres whatever matplotlib's default alignment is.
plt.bar(range(n_features), height=importances, width=1., align='edge')
plt.xticks(np.arange(0.5, n_features, 1.), text, rotation=90)
plt.xlim((0, n_features))
plt.show()
# Top 10 (or fewer, if there are fewer features), most important first.
indices = np.argsort(importances)[::-1]
for i in indices[:10]:
    print("{} {}".format(importances[i], text[i]))
In [27]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
# Candidate hyper-parameter values; every combination is tried.
parameter_grid = {
    'n_estimators': [100, 200],
    'max_features': [0.2, 0.5],
    #'max_depth': [5., None]
}
# 5-fold cross-validated search over the grid, with verbose progress output.
base_estimator = ExtraTreesClassifier(n_jobs=4)
grid_search = GridSearchCV(base_estimator, parameter_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)
Out[27]:
In [77]:
# Inspect the scores recorded by the grid search fitted above.
grid_search.grid_scores_
Out[77]:
In [27]:
from sklearn.dummy import DummyClassifier
# Baseline for comparison: a classifier using the 'most_frequent' strategy.
# Note that this rebinds `clf`.
clf = DummyClassifier(strategy='most_frequent', random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
plot_matrix(clf, X_test, y_test)
**También mirar: Pipelines en scikit-learn.**
In [8]:
from sklearn import preprocessing
In [9]:
# Prepare shuffled, standardized features and integer-encoded labels.
df = pd.read_csv('data/train.csv')
rows = df.values
np.random.shuffle(rows)  # shuffles the rows in place; no seed is set in this cell
# Last column holds the labels; columns 1..-2 hold the features (column 0 is skipped).
labels = rows[:, -1]
X = rows[:, 1:-1].astype(np.float32)
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(labels).astype(np.int32)
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
In [10]:
# add to kfkd.py
from lasagne import layers
from lasagne.nonlinearities import softmax
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import NeuralNet
import theano
net1 = NeuralNet(
layers= [
('input', layers.InputLayer),
('hidden1', layers.DenseLayer),
('dropout0', layers.DropoutLayer),
('hidden2', layers.DenseLayer),
('output', layers.DenseLayer),
],
# layer parameters:
input_shape = (None, 93),
hidden1_num_units = 500,
dropout0_p = 0.3,
hidden2_num_units = 250,
output_nonlinearity= softmax,
output_num_units= encoder.classes_.shape[0],
# optimization method:
update=nesterov_momentum,
update_learning_rate=theano.shared(np.float32(0.03)),
update_momentum=theano.shared(np.float32(0.9)),
#on_epoch_finished=[
# AdjustVariable('update_learning_rate', start=0.03, stop=0.0001),
# AdjustVariable('update_momentum', start=0.9, stop=0.999),
#EarlyStopping(patience=300)
# ],
regression=False,
max_epochs=100,
verbose=1,
)
net1.fit(X, y)
Out[10]:
In [11]:
# Display a local image file as this cell's output.
from IPython.display import Image
Image(filename='images/convincing.png')
Out[11]:
In [29]:
# Embed the matplotlib website in a 900x350 iframe.
from IPython.display import IFrame
IFrame('http://matplotlib.org/', width=900, height=350)
Out[29]:
In [30]:
% matplotlib inline
import matplotlib.pyplot as plt
In [31]:
# List the available style sheets, then activate the one at index 1 of that list.
available_styles = plt.style.available
print(available_styles)
plt.style.use(available_styles[1])
In [32]:
# Reload the CSV into df; the frame holds the file's values as read, with no
# further transformation applied in this cell.
df = pd.read_csv('data/train.csv')
In [33]:
# Line plot of the running total of feat_34.
feat_34_running_total = df['feat_34'].cumsum()
# In the format string the letter selects the colour and the next character the marker/line shape.
plt.plot(feat_34_running_total, 'b-')
# Put labels on the axes.
plt.ylabel('Este es el label para y')
plt.xlabel('Este label es para x')
#plt.xscale('log')
Out[33]:
In [34]:
# Line plot of the row-to-row differences of feat_34.
feat_34_steps = df['feat_34'].diff()
# In the format string the letter selects the colour and the next character the marker/line shape.
plt.plot(feat_34_steps, 'y-')
# Put labels on the axes.
plt.ylabel('Este es el label para y')
plt.xlabel('Este label es para x')
#plt.axis([0, 65000, -60, 60])
Out[34]:
In [35]:
# One 3D scatter figure per target class, over feat_11 / feat_14 / feat_34.
from mpl_toolkits.mplot3d import Axes3D  # kept: older matplotlib needs it for the '3d' projection
for i, group in df.groupby('target'):
    # add_subplot(..., projection='3d') works on old and new matplotlib alike,
    # whereas figure().gca(projection='3d') is no longer accepted by newer releases.
    threedee = plt.figure().add_subplot(111, projection='3d')
    threedee.set_title("scatter plot of class number {}".format(i))
    # The columns are passed directly instead of through globals named X, Y, Z,
    # so this cell no longer overwrites a feature matrix bound to `X`.
    threedee.scatter(group['feat_11'], group['feat_14'], group['feat_34'])
    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()
In [33]:
# One interpolated 3D surface per target class: feat_34 over the (feat_11, feat_14) plane.
from mpl_toolkits.mplot3d import Axes3D  # kept: older matplotlib needs it for the '3d' projection
from scipy.interpolate import griddata
for i, group in df.groupby('target'):
    # add_subplot(..., projection='3d') works on old and new matplotlib alike,
    # whereas figure().gca(projection='3d') is no longer accepted by newer releases.
    threedee = plt.figure().add_subplot(111, projection='3d')
    # The title now names the plot actually drawn (it used to say "scatter plot").
    threedee.set_title("surface plot of class number {}".format(i))
    # Lower-case locals avoid overwriting a feature matrix bound to the global `X`.
    feat_x = group['feat_11']
    feat_y = group['feat_14']
    feat_z = group['feat_34']
    # Resample the scattered points onto a regular 100x100 grid.
    xi = np.linspace(feat_x.min(), feat_x.max(), 100)
    yi = np.linspace(feat_y.min(), feat_y.max(), 100)
    zi = griddata((feat_x, feat_y), feat_z, (xi[None, :], yi[:, None]), method='cubic')
    xig, yig = np.meshgrid(xi, yi)
    surf = threedee.plot_surface(xig, yig, zi, linewidth=0)
    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()
In [34]:
# One contour figure per target class: interpolated feat_34 over the (feat_11, feat_14) plane.
# Uses `griddata` (scipy.interpolate), which this cell does not import itself.
for i, group in df.groupby('target'):
    # add_subplot(..., projection='3d') works on old and new matplotlib alike,
    # whereas figure().gca(projection='3d') is no longer accepted by newer releases.
    threedee = plt.figure().add_subplot(111, projection='3d')
    # The title now names the plot actually drawn (it used to say "scatter plot").
    threedee.set_title("contour plot of class number {}".format(i))
    # Lower-case locals avoid overwriting a feature matrix bound to the global `X`.
    feat_x = group['feat_11']
    feat_y = group['feat_14']
    feat_z = group['feat_34']
    # Resample the scattered points onto a regular 100x100 grid.
    xi = np.linspace(feat_x.min(), feat_x.max(), 100)
    yi = np.linspace(feat_y.min(), feat_y.max(), 100)
    zi = griddata((feat_x, feat_y), feat_z, (xi[None, :], yi[:, None]), method='cubic')
    xig, yig = np.meshgrid(xi, yi)
    # The contour keyword is `colors` (plural); `color` is not a contour option.
    # Drawing through the axes object makes the target of the call explicit.
    contour = threedee.contour(xig, yig, zi, 15, linewidths=0.5, colors='k')
    threedee.set_xlabel('feat_11')
    threedee.set_ylabel('feat_14')
    threedee.set_zlabel('feat_34')
plt.show()
In [36]:
# Columns used by the plots below; 'target' is kept last so that
# interest_feat[:-1] yields the feature columns only.
interest_feat = ['feat_11', 'feat_34', 'feat_14', 'feat_60', 'target']
In [22]:
from pandas.tools.plotting import andrews_curves
# The frame handed to andrews_curves must contain the class column it is told
# to group by; slicing with interest_feat[:-1] dropped 'target' and made the
# lookup fail, so the full column list is passed.
andrews_curves(df[interest_feat], 'target')
Out[22]:
In [26]:
# Parallel-coordinates plot of the selected columns, grouped by the 'target' column.
from pandas.tools.plotting import parallel_coordinates
parallel_coordinates(df[interest_feat], 'target', alpha=0.5)
Out[26]:
In [36]:
# Area plot of the running totals of the selected feature columns ('target' excluded).
df_plot = df[interest_feat[:-1]].cumsum()
df_plot.plot(kind='area', alpha=0.5)
Out[36]:
In [37]:
# Histograms (10 bins) of the row-to-row differences of the selected feature columns.
df[interest_feat[:-1]].diff().hist(alpha=0.5, bins=10)
Out[37]:
In [38]:
# Bar chart of the number of rows per target value.
df.groupby(['target']).target.count().plot(kind='bar')
Out[38]:
In [39]:
# Pie chart of the number of rows per target value.
df.groupby(['target']).target.count().plot(kind='pie')
Out[39]:
In [40]:
# Box plot of feat_34 per target class, with the raw observations overlaid.
df.boxplot(column='feat_34', by='target', grid=False)
# Iterate over the labels actually present rather than range(9): the comparison
# then matches whether 'target' holds raw labels or integer codes. The boxes are
# drawn in sorted group order at x = 1, 2, ..., hence enumerate(..., start=1).
for position, label in enumerate(sorted(df['target'].unique()), start=1):
    # Overlay the same column the boxes summarize (feat_34, not feat_11).
    values = df.loc[df['target'] == label, 'feat_34'].dropna()
    # Add some random "jitter" to the x-axis
    jitter = np.random.normal(position, 0.04, size=len(values))
    plt.plot(jitter, values, 'r.', alpha=0.02)
In [37]:
# Pairwise scatter plots of the selected feature columns, with 'kde' plots on the diagonal.
pd.scatter_matrix(df.loc[:,interest_feat[:-1]], figsize=(12,8), diagonal='kde')
Out[37]: