import numpy as np
import pandas as pd
# --- Breast-cancer data: logistic regression vs. SGD classifier -------------

# Column names for the header-less data file.
# NOTE(review): the original list was never closed and its tail was lost;
# 'Mitoses' and 'Class' are restored on the assumption that the file is the
# 11-column UCI "breast-cancer-wisconsin.data" -- confirm against the data.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',  # the original lacked this comma
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
# TODO(review): the data path/URL is empty in the original; fill it in.
df = pd.read_csv('', names=column_names)

# Missing values are encoded as '?'; turn them into NaN and drop those rows.
df = df.replace(to_replace='?', value=np.nan)
df.dropna(how='any', inplace=True)

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn only ships the legacy module
    from sklearn.cross_validation import train_test_split

# Features are the nine measurements; the sample id is not a feature and the
# label is the 'Class' column (reported below as Benign / Malignant).
# NOTE(review): the original sliced columns [0:9] / [9]; adjusted to match the
# restored 11-column layout -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df[column_names[1:10]], df[column_names[10]],
    test_size=0.25, random_state=33)

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Fit the scaler on the training split only and reuse its statistics for the
# test split; re-fitting on the test data would scale the two splits
# differently.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdb = SGDClassifier()

# NOTE(review): both fit calls were garbled in the original; restored here.
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
sgdb.fit(X_train, y_train)
sgdb_y_predict = sgdb.predict(X_test)

from sklearn.metrics import classification_report

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('Accuracy of LR %s' % lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
print('Accuracy of SGDC %s' % sgdb.score(X_test, y_test))
print(classification_report(y_test, sgdb_y_predict, target_names=['Benign', 'Malignant']))
# --- Handwritten digits: linear support vector classifier -------------------
from sklearn.datasets import load_digits
digits = load_digits()

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn only ships the legacy module
    from sklearn.cross_validation import train_test_split

# NOTE(review): the two data arguments were lost in the original
# (`train_test_split(,, ...)`); digits.data / digits.target restored.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.25, random_state=33)

from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Scale with statistics learned from the training split only.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lsvc = LinearSVC()
# NOTE(review): the fit call was garbled in the original; restored.
lsvc.fit(X_train, y_train)
y_predict = lsvc.predict(X_test)

from sklearn.metrics import classification_report

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('Accuracy of SVM %s' % lsvc.score(X_test, y_test))
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))
# --- Iris: k-nearest-neighbours classifier ----------------------------------
from sklearn.datasets import load_iris
iris = load_iris()

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn only ships the legacy module
    from sklearn.cross_validation import train_test_split

# NOTE(review): the two data arguments were lost in the original
# (`train_test_split(,, ...)`); iris.data / iris.target restored.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.25, random_state=33)

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Scale with statistics learned from the training split only.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

knc = KNeighborsClassifier()
# NOTE(review): the fit call was fused into the constructor line in the
# original (`KNeighborsClassifier(), y_train)`); restored.
knc.fit(X_train, y_train)
y_predict = knc.predict(X_test)

from sklearn.metrics import classification_report

print('Accuracy of K-Nearest Neighbor %s' % knc.score(X_test, y_test))
# The labels come from the iris dataset; the original passed the digits
# dataset's target names here by mistake.
print(classification_report(y_test, y_predict, target_names=iris.target_names))
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# --- Optical digits: K-means clustering --------------------------------------
# Both files are header-less CSVs; columns 0..63 are used as features and
# column 64 as the label.
digits_train = pd.read_csv('optdigits.tra', header=None)
digits_test = pd.read_csv('optdigits.tes', header=None)

X_train = digits_train[np.arange(64)]
y_train = digits_train[64]
X_test = digits_test[np.arange(64)]
y_test = digits_test[64]

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10)
# The estimator must be fitted before predict(); the original never called
# fit. NOTE(review): fitting on X_train is assumed -- confirm.
kmeans.fit(X_train)
y_pred = kmeans.predict(X_test)

from sklearn import metrics

# Adjusted Rand index between the true labels and the cluster assignments.
print(metrics.adjusted_rand_score(y_test, y_pred))
# --- Bag-of-words and tokenisation demo on two sample sentences -------------
sent1 = 'The cat is walking in the bedroom'
sent2 = 'A dog was running across the kitchen.'

from sklearn.feature_extraction.text import CountVectorizer

sentences = [sent1, sent2]
count_vec = CountVectorizer()

# Learn the vocabulary from both sentences and show the count matrix,
# followed by the vocabulary terms themselves.
term_counts = count_vec.fit_transform(sentences)
print(term_counts.toarray())
print(count_vec.get_feature_names())

import nltk # download tokenizers/punkt/english.pickle

# Tokenise the first sentence with NLTK.
tokens_l = nltk.word_tokenize(sent1)
print(tokens_l)
Variable Definition Key
survival Survival 0 = No, 1 = Yes
pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd
sex Sex
Age Age in years
sibsp # of siblings / spouses aboard the Titanic
parch # of parents / children aboard the Titanic
ticket Ticket number
fare Passenger fare
cabin Cabin number
embarked Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
Variable Notes
pclass: A proxy for socio-economic status (SES): 1st = Upper, 2nd = Middle, 3rd = Lower.
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5.
sibsp: The dataset defines family relations in this way: Sibling = brother, sister, stepbrother, stepsister; Spouse = husband, wife (mistresses and fiancés were ignored).
parch: The dataset defines family relations in this way: Parent = mother, father; Child = daughter, son, stepdaughter, stepson. Some children travelled only with a nanny, therefore parch = 0 for them.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# --- Titanic: random-forest baseline and submission file --------------------
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Fare']

# Work on explicit copies: the original filled values in place on slices of
# `train` / `test`, which pandas treats as chained assignment.
X_train = train[features].copy()
X_test = test[features].copy()

# Impute with values derived from the training data only.
age_fill = train['Age'].mean()
X_train['Age'] = X_train['Age'].fillna(age_fill)
X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Age'] = X_test['Age'].fillna(age_fill)
X_test['Embarked'] = X_test['Embarked'].fillna('S')

# NOTE(review): added guard -- a missing 'Fare' would reach the classifier as
# NaN. This is a no-op when the column has no gaps; confirm it is wanted.
fare_fill = train['Fare'].mean()
X_train['Fare'] = X_train['Fare'].fillna(fare_fill)
X_test['Fare'] = X_test['Fare'].fillna(fare_fill)

y_train = train['Survived']

from sklearn.feature_extraction import DictVectorizer

# One-hot encode the string-valued features via per-row dicts.
# 'records' is the full spelling of the orient; the abbreviation 'record'
# used by the original is not accepted by current pandas.
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
print(dict_vec.feature_names_)
print(X_train)
X_test = dict_vec.transform(X_test.to_dict(orient='records'))

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # older scikit-learn only ships the legacy module
    from sklearn.cross_validation import cross_val_score

# Mean 5-fold cross-validation score; the original computed this value and
# discarded it.
print(cross_val_score(rfc, X_train, y_train, cv=5).mean())

# NOTE(review): the fit call was lost in the original (its tail ended up in a
# comment), so predict() ran on an unfitted model; restored. The original
# also had a truncated `X_test = X_test.astype(` here, which is omitted
# because its target dtype is unknown -- confirm whether a cast is needed.
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)

rfc_submisson = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
rfc_submisson.to_csv('rfc_submission.csv', index=False)
import pandas as pd
import re
from bs4 import BeautifulSoup
# Load the labelled training reviews and the test reviews. Both are
# tab-separated files with a header row; quoting=3 equals csv.QUOTE_NONE.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv('imdb/labeledTrainData.tsv', **tsv_options)
test = pd.read_csv('imdb/testData.tsv', **tsv_options)

# Labels for the training reviews.
y_train = train['sentiment']
def review_to_wordlist(review):
    """Convert one IMDB review into a list of lower-case words.

    Args:
        review: Raw review text, which may contain HTML markup.

    Returns:
        list of str: The words of the review, lower-cased, with HTML removed
        and every character other than a-z / A-Z treated as a separator.
    """
    # First remove the HTML.
    review_text = BeautifulSoup(review, "html5lib").get_text()
    # Replace everything that is not an ASCII letter with a space.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Lower-case and split on whitespace. The original built this list but
    # never returned it, so callers received None.
    return review_text.lower().split()
# Clean every review and re-join its words into one space-separated string.
# Iterating the column directly avoids the Python-2-only xrange and the
# label-based `series[i]` lookups of the original index loops.
print(train.head())
traindata = [" ".join(review_to_wordlist(review)) for review in train['review']]

print(test.head())
testdata = [" ".join(review_to_wordlist(review)) for review in test['review']]
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV

# Word-level TF-IDF over unigrams and bigrams. The on/off options are passed
# as booleans rather than the integer 1 used by the original.
tfv = TFIV(min_df=3, max_features=None,
           strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
           ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
           stop_words='english')

X_all = traindata + testdata  # Combine both to fit the TFIDF vectorization.
lentrain = len(traindata)

# This is the slow part! The vectorizer must be fitted before it can
# transform; the original called transform() on an unfitted vectorizer.
X_all = tfv.fit_transform(X_all)

X = X_all[:lentrain]  # Separate back into training and test sets.
X_test = X_all[lentrain:]
from sklearn.linear_model import LogisticRegression as LR
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # older scikit-learn only ships the legacy module
    from sklearn.grid_search import GridSearchCV

grid_values = {'C': [30]}  # Decide which settings you want for the grid search.

# Try to set the scoring on what the contest is asking for.
# The contest says scoring is for area under the ROC curve, so use this.
# solver='liblinear' is named explicitly because the dual formulation is only
# implemented by that solver.
model_LR = GridSearchCV(LR(dual=True, solver='liblinear', random_state=0),
                        grid_values, scoring='roc_auc', cv=20)

# Fit the model.
# NOTE(review): the call head was lost in the original (only `,y_train)`
# survived, fused into a comment); fitting on X is assumed -- confirm.
model_LR.fit(X, y_train)
# --- MNIST from CSV: load and split off the label column --------------------
# (Notebook cell prompts `In [n]:` removed; they are not valid Python.)
import pandas as pd

# NOTE(review): the directory is spelled 'minst' in the original; the path is
# kept unchanged -- confirm it is not meant to be 'mnist'.
train = pd.read_csv('minst/train.csv')
test = pd.read_csv('minst/test.csv')

print(train.shape)
print(test.shape)

# The training file carries the target in its 'label' column; every other
# column is kept as a feature.
y_train = train['label']
X_train = train.drop('label', axis=1)

# The test frame is used as-is.
X_test = test
import tensorflow as tf
import tensorlayer as tl
# --- MNIST: multilayer perceptron with TensorLayer ---------------------------
sess = tf.InteractiveSession()

# Prepare the data.
# NOTE(review): the right-hand side of this assignment was lost in the
# original (dangling line continuation); restored on the assumption that the
# TensorLayer MNIST loader with flat 784-feature rows was used -- confirm.
X_train, y_train, X_val, y_val, X_test, y_test = \
    tl.files.load_mnist_dataset(shape=(-1, 784))

# Define the placeholders.
x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
y_ = tf.placeholder(tf.int64, shape=[None, ], name='y_')

# Define the model: input -> dropout -> 800 ReLU -> dropout -> 800 ReLU
# -> dropout -> 10 linear outputs.
network = tl.layers.InputLayer(x, name='input_layer')
network = tl.layers.DropoutLayer(network, keep=0.8, name='drop1')
network = tl.layers.DenseLayer(network, n_units=800,
                               act=tf.nn.relu, name='relu1')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop2')
network = tl.layers.DenseLayer(network, n_units=800,
                               act=tf.nn.relu, name='relu2')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop3')
# NOTE(review): this call was never closed in the original and its `name`
# argument was lost; 'output_layer' is an assumed name -- confirm.
network = tl.layers.DenseLayer(network, n_units=10,
                               act=tf.identity,
                               name='output_layer')

# Define the loss function and the evaluation metric.
# tl.cost.cross_entropy applies softmax internally via
# tf.nn.sparse_softmax_cross_entropy_with_logits().
y = network.outputs
cost = tl.cost.cross_entropy(y, y_, name='cost')
correct_prediction = tf.equal(tf.argmax(y, 1), y_)
acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
y_op = tf.argmax(tf.nn.softmax(y), 1)

# Define the optimizer.
train_params = network.all_params
train_op = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999,
                                  epsilon=1e-08, use_locking=False).minimize(cost, var_list=train_params)

# Initialise all parameters in the session.
# NOTE(review): this statement was lost in the original (only its comment
# survived); restored from the TensorLayer 1.x API -- confirm.
tl.layers.initialize_global_variables(sess)

# Print the model information.
# NOTE(review): also lost in the original; restored -- confirm.
network.print_params()
network.print_layers()

# Train the model.
# NOTE(review): the head of this call (`tl.utils.fit(sess`) was lost in the
# original; the remaining arguments are unchanged.
tl.utils.fit(sess, network, train_op, cost, X_train, y_train, x, y_,
             acc=acc, batch_size=500, n_epoch=500, print_freq=5,
             X_val=X_val, y_val=y_val, eval_train=False)

# Evaluate the model.
tl.utils.test(sess, network, acc, X_test, y_test, x, y_, batch_size=None, cost=cost)

# Save the model parameters to an .npz file.
tl.files.save_npz(network.all_params, name='model.npz')