In [ ]:
import numpy as np
import pandas as pd
In [ ]:
df = pd.read_csv('breast-cancer-wisconsin.data', names=[
'Sample code number',
'Clump Thickness',
'Uniformity of Cell Size',
'Uniformity of Cell Shape',
'Marginal Adhesion',
'Single Epithelial Cell Size',  # the comma matters: without it Python concatenates adjacent strings, yielding 10 names for 11 columns
'Bare Nuclei',
'Bland Chromatin',
'Normal Nucleoli',
'Mitoses',
'Class'
])
In [ ]:
df.head()
In [ ]:
df.shape
In [ ]:
df.info()
In [ ]:
df = df.replace(to_replace='?', value=np.nan)
In [ ]:
df.isnull().sum()
In [ ]:
df.dropna(how='any', inplace=True)
In [ ]:
df.shape
In [ ]:
df.isnull().sum()
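Dropping rows is the simplest treatment. As an alternative, here is a sketch (my own addition, not part of the original flow) that imputes the missing values instead; in this dataset the '?' entries all sit in the 'Bare Nuclei' column.
In [ ]:
# Alternative sketch: impute instead of dropping rows (illustrative only;
# the notebook continues with the dropna result above).
df_alt = pd.read_csv('breast-cancer-wisconsin.data', names=list(df.columns))
df_alt = df_alt.replace('?', np.nan)
df_alt['Bare Nuclei'] = df_alt['Bare Nuclei'].astype(float)
df_alt['Bare Nuclei'] = df_alt['Bare Nuclei'].fillna(df_alt['Bare Nuclei'].median())
df_alt.isnull().sum()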
In [ ]:
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20
X_train, X_test, y_train, y_test = train_test_split(df[df.columns[1:10]], df[df.columns[10]], test_size=0.25, random_state=33)  # skip column 0 (the sample ID); columns 1-9 are features, column 10 is 'Class'
In [ ]:
y_train.value_counts()
In [ ]:
y_test.value_counts()
In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
In [ ]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)  # use the scaler fitted on the training data; re-fitting on the test set leaks information
In [ ]:
X_train
In [ ]:
lr = LogisticRegression()
sgdc = SGDClassifier()
In [ ]:
lr.fit(X_train, y_train)
In [ ]:
lr_y_predict = lr.predict(X_test)
In [ ]:
lr_y_predict
In [ ]:
y_test
In [ ]:
sgdc.fit(X_train, y_train)
In [ ]:
sgdc_y_predict = sgdc.predict(X_test)
In [ ]:
sgdc_y_predict
In [ ]:
from sklearn.metrics import classification_report
In [ ]:
print('Accuracy of LR:', lr.score(X_test, y_test))
In [ ]:
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
In [ ]:
print('Accuracy of SGDClassifier:', sgdc.score(X_test, y_test))
In [ ]:
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))
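classification_report summarizes precision, recall and F1 per class; a confusion matrix (a small companion sketch, not in the original notebook) exposes the raw counts behind those scores. In this dataset class 2 is benign and class 4 is malignant.
In [ ]:
# Sketch: raw error counts; rows are true classes, columns are predictions.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, lr_y_predict))
print(confusion_matrix(y_test, sgdc_y_predict))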
In [ ]:
from sklearn.datasets import load_digits
In [ ]:
digits = load_digits()
digits.data.shape
In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.25, random_state=33)
In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
In [ ]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)  # transform with the training-set scaler; do not re-fit on test data
In [ ]:
lsvc = LinearSVC()
In [ ]:
lsvc.fit(X_train, y_train)
In [ ]:
y_predict = lsvc.predict(X_test)
In [ ]:
y_predict
In [ ]:
from sklearn.metrics import classification_report
In [ ]:
print('Accuracy of SVM:', lsvc.score(X_test, y_test))
In [ ]:
print(classification_report(y_test, y_predict, target_names=digits.target_names.astype(str)))
In [ ]:
from sklearn.datasets import load_iris
In [ ]:
iris = load_iris()
iris.data.shape
In [ ]:
print(iris.DESCR)
In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.25, random_state=33)
In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
In [ ]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)  # transform with the training-set scaler; do not re-fit on test data
In [ ]:
knc = KNeighborsClassifier()
knc.fit(X_train, y_train)
In [ ]:
y_predict = knc.predict(X_test)
In [ ]:
from sklearn.metrics import classification_report
print('Accuracy of K-Nearest Neighbor:', knc.score(X_test, y_test))
In [ ]:
print(classification_report(y_test, y_predict, target_names=iris.target_names))  # use the iris class names, not the digits names
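KNeighborsClassifier defaults to n_neighbors=5. As a sketch of my own (not part of the original flow), the value of k can be chosen by cross-validation on the training split:
In [ ]:
# Sketch: pick n_neighbors by 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV
knc_grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': list(range(1, 11))}, cv=5)
knc_grid.fit(X_train, y_train)
print(knc_grid.best_params_, knc_grid.best_score_)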
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [ ]:
digits_train = pd.read_csv('optdigits.tra', header=None)
In [ ]:
digits_test = pd.read_csv('optdigits.tes', header=None)
In [ ]:
X_train = digits_train[np.arange(64)]
y_train = digits_train[64]
In [ ]:
X_test = digits_test[np.arange(64)]
y_test = digits_test[64]
In [ ]:
from sklearn.cluster import KMeans
In [ ]:
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_train)
In [ ]:
y_pred = kmeans.predict(X_test)
In [ ]:
from sklearn import metrics
In [ ]:
print(metrics.adjusted_rand_score(y_test, y_pred))
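The adjusted Rand index requires the ground-truth labels. When none are available, silhouette analysis judges cluster quality from the data alone; here is a companion sketch (my own addition):
In [ ]:
# Sketch: measure cluster cohesion/separation without using y_test.
print(metrics.silhouette_score(X_test, y_pred, metric='euclidean'))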
In [ ]:
sent1 = 'The cat is walking in the bedroom'
sent2 = 'A dog was running across the kitchen.'
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
count_vec = CountVectorizer()
In [ ]:
sentences = [sent1, sent2]
In [ ]:
print(count_vec.fit_transform(sentences).toarray())
In [ ]:
print(count_vec.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
In [ ]:
import nltk
# nltk.download('punkt')  # run once to fetch tokenizers/punkt/english.pickle
In [ ]:
tokens_l = nltk.word_tokenize(sent1)
print(tokens_l)
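Beyond tokenization, NLTK can also normalize word forms. As a small sketch (my own addition), a Porter stemmer reduces each token to its stem, so that e.g. 'walking' and 'walked' map to the same form:
In [ ]:
# Sketch: stem each token from the sentence above.
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print([stemmer.stem(t) for t in tokens_l])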
Variable  Definition                                   Key
survival  Survival                                     0 = No, 1 = Yes
pclass    Ticket class                                 1 = 1st, 2 = 2nd, 3 = 3rd
sex       Sex
Age       Age in years
sibsp     # of siblings / spouses aboard the Titanic
parch     # of parents / children aboard the Titanic
ticket    Ticket number
fare      Passenger fare
cabin     Cabin number
embarked  Port of Embarkation                          C = Cherbourg, Q = Queenstown, S = Southampton

Variable Notes
pclass: A proxy for socio-economic status (SES): 1st = Upper, 2nd = Middle, 3rd = Lower.
age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5.
sibsp: The dataset defines family relations in this way: Sibling = brother, sister, stepbrother, stepsister; Spouse = husband, wife (mistresses and fiancés were ignored).
parch: The dataset defines family relations in this way: Parent = mother, father; Child = daughter, son, stepdaughter, stepson. Some children travelled only with a nanny, therefore parch=0 for them.
In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [ ]:
train = pd.read_csv('titanic/train.csv')
# print(train.info())
test = pd.read_csv('titanic/test.csv')
# print(test.info())
In [ ]:
train.isnull().any()
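The data dictionary above notes that sibsp and parch together describe a passenger's family aboard. As a hypothetical illustration (the 'FamilySize' name is my own, and the model below does not use this feature), the two can be combined:
In [ ]:
# Hypothetical illustration: combine SibSp and Parch into one family-size
# value; +1 counts the passenger themselves. Not used by the model below.
(train['SibSp'] + train['Parch'] + 1).rename('FamilySize').head()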
In [ ]:
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Fare']
X_train = train[features].copy()  # copy so the fillna below does not write to a slice of train/test
X_test = test[features].copy()
In [ ]:
X_train['Age'] = X_train['Age'].fillna(train['Age'].mean())
X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Age'] = X_test['Age'].fillna(train['Age'].mean())
X_test['Embarked'] = X_test['Embarked'].fillna('S')
X_test['Fare'] = X_test['Fare'].fillna(train['Fare'].mean())  # the Kaggle test set has one missing Fare value
y_train = train['Survived']
In [ ]:
X_train.isnull().any()
In [ ]:
X_train.shape
In [ ]:
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse=False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))  # 'records', not 'record'
print(dict_vec.feature_names_)
print(X_train)
In [ ]:
X_test = dict_vec.transform(X_test.to_dict(orient='records'))
In [ ]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
In [ ]:
from sklearn.model_selection import cross_val_score
In [ ]:
cross_val_score(rfc, X_train, y_train, cv=5).mean()
In [ ]:
rfc.fit(X_train, y_train)
In [ ]:
X_test = X_test.astype(int)  # np.int was removed from NumPy; a plain int cast behaves the same here
In [ ]:
rfc_y_predict = rfc.predict(X_test)
In [ ]:
rfc_y_predict
In [ ]:
rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
In [ ]:
rfc_submission.to_csv('rfc_submission.csv', index=False)
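As a quick sanity check (my own addition, not part of the original flow), the fitted forest's feature importances can be lined up with the DictVectorizer feature names:
In [ ]:
# Sketch: rank the vectorized features by their importance in the forest.
pd.Series(rfc.feature_importances_, index=dict_vec.feature_names_).sort_values(ascending=False)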
In [ ]:
import pandas as pd
import re
from bs4 import BeautifulSoup
In [ ]:
train = pd.read_csv('imdb/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
test = pd.read_csv('imdb/testData.tsv', header=0, delimiter="\t", quoting=3)
In [ ]:
y_train = train['sentiment']
In [ ]:
def review_to_wordlist(review):
    '''
    Meant for converting each of the IMDB reviews into a list of words.
    '''
    # First remove the HTML.
    review_text = BeautifulSoup(review, "html5lib").get_text()
    # Use regular expressions to only include words.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Convert words to lower case and split them into separate words.
    words = review_text.lower().split()
    # Return the list of words.
    return words
In [ ]:
print(train.head())
traindata = []
for i in range(len(train['review'])):
    traindata.append(" ".join(review_to_wordlist(train['review'][i])))
print(test.head())
testdata = []
for i in range(len(test['review'])):
    testdata.append(" ".join(review_to_wordlist(test['review'][i])))
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
In [ ]:
tfv = TFIV(min_df=3, max_features=None,
           strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
           ngram_range=(1, 2), use_idf=True, smooth_idf=True, sublinear_tf=True,
           stop_words='english')
In [ ]:
X_all = traindata + testdata # Combine both to fit the TFIDF vectorization.
lentrain = len(traindata)
tfv.fit(X_all) # This is the slow part!
X_all = tfv.transform(X_all)
X = X_all[:lentrain] # Separate back into training and test sets.
X_test = X_all[lentrain:]
In [ ]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20
In [ ]:
grid_values = {'C': [30]}  # Decide which settings you want for the grid search.
model_LR = GridSearchCV(LR(solver='liblinear', dual=True, random_state=0),  # dual=True requires the liblinear solver (the pre-0.22 default)
                        grid_values, scoring='roc_auc', cv=20)
# The contest scores submissions on area under the ROC curve, so use that
# as the model-selection metric.
model_LR.fit(X, y_train)  # Fit the model.
In [ ]:
model_LR.cv_results_  # grid_scores_ was removed in scikit-learn 0.20; cv_results_ replaces it
In [ ]:
model_LR.best_estimator_
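To turn the tuned model into a contest submission, here is a sketch of my own (it assumes the 'id' column of testData.tsv and the contest's id/sentiment submission format):
In [ ]:
# Sketch: predict sentiment for the held-out reviews and write a submission.
# Assumes testData.tsv has an 'id' column and the contest expects id,sentiment.
lr_result = model_LR.predict(X_test)
lr_submission = pd.DataFrame({'id': test['id'], 'sentiment': lr_result})
lr_submission.to_csv('lr_submission.csv', index=False)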
In [1]:
import pandas as pd
In [2]:
train = pd.read_csv('minst/train.csv')
test = pd.read_csv('minst/test.csv')
In [3]:
print(train.shape)
print(test.shape)
In [4]:
y_train = train['label']
X_train = train.drop('label', axis=1)
In [5]:
X_test = test
In [ ]:
import tensorflow as tf
import tensorlayer as tl
# Note: this cell uses the TensorFlow 1.x / TensorLayer 1.x APIs, and
# load_mnist_dataset below replaces the Kaggle CSVs loaded above with
# TensorLayer's own MNIST download.
sess = tf.InteractiveSession()
# Prepare the data
X_train, y_train, X_val, y_val, X_test, y_test = \
    tl.files.load_mnist_dataset(shape=(-1, 784))
# Define the placeholders
x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
y_ = tf.placeholder(tf.int64, shape=[None], name='y_')
# Define the model
network = tl.layers.InputLayer(x, name='input_layer')
network = tl.layers.DropoutLayer(network, keep=0.8, name='drop1')
network = tl.layers.DenseLayer(network, n_units=800,
                               act=tf.nn.relu, name='relu1')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop2')
network = tl.layers.DenseLayer(network, n_units=800,
                               act=tf.nn.relu, name='relu2')
network = tl.layers.DropoutLayer(network, keep=0.5, name='drop3')
network = tl.layers.DenseLayer(network, n_units=10,
                               act=tf.identity,
                               name='output_layer')
# Define the loss function and evaluation metric
# tl.cost.cross_entropy applies softmax internally via
# tf.nn.sparse_softmax_cross_entropy_with_logits()
y = network.outputs
cost = tl.cost.cross_entropy(y, y_, name='cost')
correct_prediction = tf.equal(tf.argmax(y, 1), y_)
acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
y_op = tf.argmax(tf.nn.softmax(y), 1)
# Define the optimizer
train_params = network.all_params
train_op = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999,
                                  epsilon=1e-08, use_locking=False).minimize(cost, var_list=train_params)
# Initialize all variables in the session
tl.layers.initialize_global_variables(sess)
# Print the model information
network.print_params()
network.print_layers()
# Train the model
tl.utils.fit(sess, network, train_op, cost, X_train, y_train, x, y_,
             acc=acc, batch_size=500, n_epoch=500, print_freq=5,
             X_val=X_val, y_val=y_val, eval_train=False)
# Evaluate the model
tl.utils.test(sess, network, acc, X_test, y_test, x, y_, batch_size=None, cost=cost)
# Save the model as an .npz file
tl.files.save_npz(network.all_params, name='model.npz')
sess.close()
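A possible inference step for the Kaggle test images loaded earlier (my own sketch, not in the original; it would have to run before the sess.close() call above, and it assumes TensorLayer 1.x's tl.utils.dict_to_one helper for disabling dropout at prediction time):
In [ ]:
# Sketch: predict digits for the Kaggle test set with the trained network.
# Dropout layers are disabled by feeding keep-probabilities of 1.0.
kaggle_X_test = test.values.astype('float32') / 255.0  # match the [0, 1] scaling of load_mnist_dataset
feed_dict = {x: kaggle_X_test}
feed_dict.update(tl.utils.dict_to_one(network.all_drop))
kaggle_pred = sess.run(y_op, feed_dict=feed_dict)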