In [ ]:
import numpy as np
import pandas as pd

Data Loading


In [ ]:
df = pd.read_csv('breast-cancer-wisconsin.data', names=[
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class'
])

In [ ]:
df.head()

In [ ]:
df.shape

In [ ]:
df.info()

In [ ]:
df = df.replace(to_replace='?', value=np.nan)

In [ ]:
df.isnull().sum()

In [ ]:
df.dropna(how='any', inplace=True)

In [ ]:
df.shape

In [ ]:
df.isnull().sum()

Train/Test Split


In [ ]:
from sklearn.cross_validation import train_test_split
# columns 1-9 are the features; column 10 ('Class') is the label; column 0 is the sample ID
X_train, X_test, y_train, y_test = train_test_split(df[df.columns[1:10]], df[df.columns[10]], test_size=0.25, random_state=33)

In [ ]:
y_train.value_counts()

In [ ]:
y_test.value_counts()

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

In [ ]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)  # reuse the scaler fitted on the training set

In [ ]:
X_train

In [ ]:
lr = LogisticRegression()
sgdb = SGDClassifier()

In [ ]:
lr.fit(X_train, y_train)

In [ ]:
lr_y_predict = lr.predict(X_test)

In [ ]:
lr_y_predict

In [ ]:
y_test

In [ ]:
sgdb.fit(X_train, y_train)

In [ ]:
sgdb_y_predict = sgdb.predict(X_test)

In [ ]:
sgdb_y_predict

Performance Evaluation


In [ ]:
from sklearn.metrics import classification_report

In [ ]:
print 'Accuracy of LR', lr.score(X_test, y_test)

In [ ]:
print classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant'])

In [ ]:
print 'Accuracy of SGDC', sgdb.score(X_test, y_test)

In [ ]:
print classification_report(y_test, sgdb_y_predict, target_names=['Benign', 'Malignant'])
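
As a complement to the accuracy scores and classification reports above, a confusion matrix makes the error types of the two classifiers explicit. A minimal sketch (the labels 2 = benign and 4 = malignant are the class encoding used by the Wisconsin dataset):


In [ ]:
# Confusion matrices for both classifiers: rows are true classes, columns are predicted classes.
# The dataset encodes benign as 2 and malignant as 4.
from sklearn.metrics import confusion_matrix
print confusion_matrix(y_test, lr_y_predict, labels=[2, 4])
print confusion_matrix(y_test, sgdb_y_predict, labels=[2, 4])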

Handwritten Digits with SVM

Load Data


In [ ]:
from sklearn.datasets import load_digits

In [ ]:
digits = load_digits()
digits.data.shape
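
Each row of digits.data is an 8x8 grayscale image flattened into 64 features. A minimal sketch that displays one sample (uses matplotlib, which the original notebook imports later for the K-Means section):


In [ ]:
# Show one sample image and its label; digits.images holds the unflattened 8x8 arrays.
import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(digits.images[0], cmap='gray')
plt.title('label: %d' % digits.target[0])
plt.show()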

Train/Test Split


In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

In [ ]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)  # reuse the scaler fitted on the training set

Model Training


In [ ]:
lsvc = LinearSVC()

In [ ]:
lsvc.fit(X_train, y_train)

In [ ]:
y_predict = lsvc.predict(X_test)

In [ ]:
y_predict

In [ ]:
from sklearn.metrics import classification_report

In [ ]:
print 'Accuracy of SVM', lsvc.score(X_test, y_test)

In [ ]:
print classification_report(y_test, y_predict, target_names=digits.target_names.astype(str))

Supervised Methods

KNN


In [ ]:
from sklearn.datasets import load_iris

In [ ]:
iris = load_iris()
iris.data.shape

In [ ]:
iris.DESCR

In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=33)

In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [ ]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)  # reuse the scaler fitted on the training set

In [ ]:
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)

In [ ]:
y_predict = knc.predict(X_test)

In [ ]:
from sklearn.metrics import classification_report
print 'Accuracy of K-Nearest Neighbor', knc.score(X_test, y_test)

In [ ]:
print classification_report(y_test, y_predict, target_names=iris.target_names)

K-Means


In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [ ]:
digits_train = pd.read_csv('optdigits.tra', header=None)

In [ ]:
digits_test = pd.read_csv('optdigits.tes', header=None)

In [ ]:
X_train = digits_train[np.arange(64)]
y_train = digits_train[64]

In [ ]:
X_test = digits_test[np.arange(64)]
y_test = digits_test[64]

In [ ]:
from sklearn.cluster import KMeans

In [ ]:
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)

In [ ]:
y_pred = kmeans.predict(X_test)

In [ ]:
from sklearn import metrics

In [ ]:
print metrics.adjusted_rand_score(y_test, y_pred)
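
The adjusted Rand index above needs the true labels. When no labels are available, the silhouette coefficient evaluates a clustering from the data alone; a minimal sketch on the same test split (values closer to 1 indicate tighter, better-separated clusters):


In [ ]:
# Silhouette coefficient: an internal clustering metric that needs no ground-truth labels.
from sklearn.metrics import silhouette_score
print silhouette_score(X_test, y_pred, metric='euclidean')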

NLTK


In [ ]:
sent1 = 'The cat is walking in the bedroom'
sent2 = 'A dog was running across the kitchen.'

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()

In [ ]:
sentences = [sent1, sent2]

In [ ]:
print count_vec.fit_transform(sentences).toarray()

In [ ]:
print count_vec.get_feature_names()

In [ ]:
import nltk
#nltk.download() # download tokenizers/punkt/english.pickle

In [ ]:
tokens_l = nltk.word_tokenize(sent1)
print tokens_l
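
word_tokenize only splits the sentence into tokens; NLTK can also tag them. A minimal sketch of part-of-speech tagging on the same tokens (assumes the 'averaged_perceptron_tagger' model has been downloaded, analogous to the punkt download noted above):


In [ ]:
# Part-of-speech tagging of the tokens produced above.
# nltk.download('averaged_perceptron_tagger') may be needed first, like the punkt download above.
print nltk.pos_tag(tokens_l)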

Titanic

Variable definitions:

survival: Survival (0 = No, 1 = Yes)
pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
sex: Sex
age: Age in years
sibsp: # of siblings / spouses aboard the Titanic
parch: # of parents / children aboard the Titanic
ticket: Ticket number
fare: Passenger fare
cabin: Cabin number
embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

Variable Notes

pclass: A proxy for socio-economic status (SES): 1st = Upper, 2nd = Middle, 3rd = Lower.

age: Age is fractional if less than 1. If the age is estimated, it is in the form xx.5.

sibsp: The dataset defines family relations in this way: Sibling = brother, sister, stepbrother, stepsister; Spouse = husband, wife (mistresses and fiancés were ignored).

parch: The dataset defines family relations in this way: Parent = mother, father; Child = daughter, son, stepdaughter, stepson. Some children travelled only with a nanny, therefore parch = 0 for them.
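
The sibsp and parch definitions above suggest a simple derived feature. A minimal sketch (a hypothetical FamilySize count, not used by the model below), loading the same train.csv as the next cells:


In [ ]:
# Hypothetical derived feature from the sibsp/parch notes above; +1 counts the passenger themselves.
import pandas as pd
titanic = pd.read_csv('titanic/train.csv')
family_size = titanic['SibSp'] + titanic['Parch'] + 1
print family_size.value_counts()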


In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [ ]:
train = pd.read_csv('titanic/train.csv')
#print train.info()
test = pd.read_csv('titanic/test.csv')
#print test.info()

In [ ]:
train.isnull().any()

In [ ]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Fare']
X_train = train[features].copy()  # copy so the fillna calls below do not trigger SettingWithCopyWarning
X_test = test[features].copy()

In [ ]:
X_train['Age'].fillna(train['Age'].mean(), inplace=True)
X_train['Embarked'].fillna('S', inplace=True)
X_test['Age'].fillna(train['Age'].mean(), inplace=True)
X_test['Embarked'].fillna('S', inplace=True)
X_test['Fare'].fillna(train['Fare'].mean(), inplace=True)  # the Kaggle test set has one missing Fare value

y_train = train['Survived']

In [ ]:
X_train.isnull().any()

In [ ]:
X_train.shape

In [ ]:
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
print dict_vec.feature_names_

print X_train

In [ ]:
X_test = dict_vec.transform(X_test.to_dict(orient='records'))

In [ ]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [ ]:
from sklearn.cross_validation import cross_val_score

In [ ]:
#cross_val_score(rfc, X_train, y_train, cv=5).mean()

In [ ]:
cross_val_score(rfc, X_train, y_train, cv=5).mean()
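
The forest above uses default hyperparameters. A minimal sketch of tuning n_estimators with the same 5-fold cross-validation (the candidate values are illustrative, not from the original):


In [ ]:
# Small grid search over the number of trees; candidate values are illustrative.
from sklearn.grid_search import GridSearchCV
rfc_gs = GridSearchCV(RandomForestClassifier(), {'n_estimators': [10, 50, 100]}, cv=5)
rfc_gs.fit(X_train, y_train)
print rfc_gs.best_params_, rfc_gs.best_score_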

In [ ]:
#print test['PassengerId']
rfc.fit(X_train, y_train)
#rfc_submission = pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':rfc_y_predict})

In [ ]:
X_test = X_test.astype(np.int)
# rfc_y_predict = rfc.predict(X_test)

In [ ]:
rfc_y_predict = rfc.predict(X_test)

In [ ]:
rfc_y_predict

In [ ]:
rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})

In [ ]:
rfc_submission.to_csv('rfc_submission.csv', index=False)

IMDB


In [ ]:
import pandas as pd
import re
from bs4 import BeautifulSoup

In [ ]:
train = pd.read_csv('imdb/labeledTrainData.tsv', header=0,
                delimiter="\t", quoting=3)
test = pd.read_csv('imdb/testData.tsv', header=0, delimiter="\t",
               quoting=3 )

In [ ]:
y_train = train['sentiment']

In [ ]:
def review_to_wordlist(review):
    '''
    Meant for converting each of the IMDB reviews into a list of words.
    '''
    # First remove the HTML.
    review_text = BeautifulSoup(review, "html5lib").get_text()
    
    # Use regular expressions to only include words.
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
    # Convert words to lower case and split them into separate words.
    words = review_text.lower().split()
   
    # Return a list of words
    return(words)
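
A quick sanity check of the helper on a single review before it is applied to the whole corpus (the exact words depend on the data):


In [ ]:
# Inspect the first ten words of the first training review.
print review_to_wordlist(train['review'][0])[:10]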

In [ ]:
print train.head()
traindata = []
for i in xrange(0,len(train['review'])):
    traindata.append(" ".join(review_to_wordlist(train['review'][i])))
testdata = []
print test.head()
for i in xrange(0,len(test['review'])):
    testdata.append(" ".join(review_to_wordlist(test['review'][i])))

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

In [ ]:
tfv = TFIV(min_df=3,  max_features=None, 
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')

In [ ]:
X_all = traindata + testdata # Combine both to fit the TFIDF vectorization.
lentrain = len(traindata)

tfv.fit(X_all) # This is the slow part!
X_all = tfv.transform(X_all)

X = X_all[:lentrain] # Separate back into training and test sets. 
X_test = X_all[lentrain:]

In [ ]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.grid_search import GridSearchCV

In [ ]:
grid_values = {'C':[30]} # Decide which settings you want for the grid search. 

model_LR = GridSearchCV(LR(dual = True, random_state = 0), 
                        grid_values, scoring = 'roc_auc', cv = 20) 
# Try to set the scoring on what the contest is asking for. 
# The contest says scoring is for area under the ROC curve, so use this.
                        
model_LR.fit(X,y_train) # Fit the model.

In [ ]:
model_LR.grid_scores_

In [ ]:
model_LR.best_estimator_
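
The contest is scored on area under the ROC curve, so a submission wants the predicted probability of the positive class for each test review. A minimal sketch, assuming the test file has an 'id' column and the usual two-column (id, sentiment) submission format; neither is confirmed by the notebook itself:


In [ ]:
# Probability of the positive class, matching the ROC-AUC scoring used above.
# Assumptions: testData.tsv contains an 'id' column; the submission expects 'id' and 'sentiment' columns.
lr_probs = model_LR.predict_proba(X_test)[:, 1]
lr_submission = pd.DataFrame({'id': test['id'], 'sentiment': lr_probs})
lr_submission.to_csv('lr_submission.csv', index=False)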

MNIST


In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('minst/train.csv')
test = pd.read_csv('minst/test.csv')

In [3]:
print train.shape
print test.shape


(42000, 785)
(28000, 784)

In [4]:
y_train = train['label']
X_train = train.drop('label', axis=1)

In [5]:
X_test = test

In [ ]:
import tensorflow as tf
import tensorlayer as tl

sess = tf.InteractiveSession()

# Prepare the data
X_train, y_train, X_val, y_val, X_test, y_test = \
                                tl.files.load_mnist_dataset(shape=(-1,784))

# Define placeholders
x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
y_ = tf.placeholder(tf.int64, shape=[None, ], name='y_')

# Define the model
network = tl.layers.InputLayer(x, name='input_layer')
network = tl.layers.DropoutLayer(network, keep=0.8, name='drop1')
network = tl.layers.DenseLayer(network, n_units=800,
                                act = tf.nn.relu, name='relu1')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop2')
network = tl.layers.DenseLayer(network, n_units=800,
                                act = tf.nn.relu, name='relu2')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop3')
network = tl.layers.DenseLayer(network, n_units=10,
                                act = tf.identity,
                                name='output_layer')
# Define the loss function and metrics
# tl.cost.cross_entropy applies softmax internally via tf.nn.sparse_softmax_cross_entropy_with_logits()
y = network.outputs
cost = tl.cost.cross_entropy(y, y_, name = 'cost')
correct_prediction = tf.equal(tf.argmax(y, 1), y_)
acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
y_op = tf.argmax(tf.nn.softmax(y), 1)

# Define the optimizer
train_params = network.all_params
train_op = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999,
                            epsilon=1e-08, use_locking=False).minimize(cost, var_list=train_params)

# Initialize all variables in the session
tl.layers.initialize_global_variables(sess)

# Print model information
network.print_params()
network.print_layers()

# Train the model
tl.utils.fit(sess, network, train_op, cost, X_train, y_train, x, y_,
            acc=acc, batch_size=500, n_epoch=500, print_freq=5,
            X_val=X_val, y_val=y_val, eval_train=False)

# Evaluate the model
tl.utils.test(sess, network, acc, X_test, y_test, x, y_, batch_size=None, cost=cost)

# Save the model as an .npz file
tl.files.save_npz(network.all_params , name='model.npz')
sess.close()


Load or Download MNIST > data/mnist/
data/mnist/train-images-idx3-ubyte.gz
data/mnist/t10k-images-idx3-ubyte.gz
  [TL] InputLayer  input_layer: (?, 784)
  [TL] DropoutLayer drop1: keep:0.800000 is_fix:False
  [TL] DenseLayer  relu1: 800 relu
  [TL] DropoutLayer drop2: keep:0.500000 is_fix:False
  [TL] DenseLayer  relu2: 800 relu
  [TL] DropoutLayer drop3: keep:0.500000 is_fix:False
  [TL] DenseLayer  output_layer: 10 identity
  param   0: relu1/W:0            (784, 800)         float32_ref (mean: -0.000156755762873, median: -0.000153482658789, std: 0.0879837945104   )   
  param   1: relu1/b:0            (800,)             float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
  param   2: relu2/W:0            (800, 800)         float32_ref (mean: -5.23396010976e-05, median: -9.0194385848e-05 , std: 0.0879286974669   )   
  param   3: relu2/b:0            (800,)             float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
  param   4: output_layer/W:0     (800, 10)          float32_ref (mean: -0.000112081870611, median: -0.000939153658692, std: 0.0878978148103   )   
  param   5: output_layer/b:0     (10,)              float32_ref (mean: 0.0               , median: 0.0               , std: 0.0               )   
  num of params: 1276810
  layer   0: drop1/mul:0          (?, 784)           float32
  layer   1: relu1/Relu:0         (?, 800)           float32
  layer   2: drop2/mul:0          (?, 800)           float32
  layer   3: relu2/Relu:0         (?, 800)           float32
  layer   4: drop3/mul:0          (?, 800)           float32
  layer   5: output_layer/Identity:0 (?, 10)            float32
Start training the network ...
Epoch 1 of 500 took 4.844463s
   val loss: 0.580788
   val acc: 0.815400
Epoch 5 of 500 took 4.839795s
   val loss: 0.293327
   val acc: 0.914300
Epoch 10 of 500 took 4.830770s
   val loss: 0.223286
   val acc: 0.937200
Epoch 15 of 500 took 4.855958s
   val loss: 0.187789
   val acc: 0.949300
Epoch 20 of 500 took 4.826501s
   val loss: 0.164285
   val acc: 0.954800
Epoch 25 of 500 took 4.841586s
   val loss: 0.145372
   val acc: 0.959400
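
The network above is trained and evaluated on the built-in MNIST split, not on the Kaggle test.csv loaded in cell [2]. A minimal sketch of labeling that Kaggle test set with the trained network via the y_op defined above; it assumes it is run before sess.close(), that tl.utils.predict (TensorLayer 1.x) feeds data through y_op with dropout disabled, and that the submission wants the usual ImageId/Label columns:


In [ ]:
# Assumptions: runs before sess.close(); pixel values are rescaled to [0, 1] to match
# tl.files.load_mnist_dataset; the Kaggle submission format is ImageId, Label.
kaggle_preds = tl.utils.predict(sess, network, test.values.astype('float32') / 255.0, x, y_op)
submission = pd.DataFrame({'ImageId': range(1, len(kaggle_preds) + 1), 'Label': kaggle_preds})
submission.to_csv('mnist_submission.csv', index=False)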

In [ ]: