In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; the same names now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from IPython.display import Image
# Silence pandas' SettingWithCopyWarning globally, using the canonical full
# option name ('chained_assignment' alone relies on pandas' substring
# matching).  NOTE(review): suppressing this warning hides real
# chained-assignment bugs — prefer fixing the assignments themselves.
pd.set_option('mode.chained_assignment', None)
# ggplot look, with the major tick marks hidden on both axes.
# (style.use must come first: applying a style resets rcParams.)
plt.style.use('ggplot')
plt.rcParams['xtick.major.size'] = 0
plt.rcParams['ytick.major.size'] = 0
In [3]:
# Load the merged user/tag table.  Columns used later include per-tag counts
# ('nail', 'person', 'food', ...) plus 'user_id', 'user_name' and the
# 'gender_male' label.  Relative path — assumes the CSV sits next to the
# notebook; TODO confirm provenance of user_tags_merge.csv.
user_tags = pd.read_csv("user_tags_merge.csv")
In [4]:
# Peek at the loaded table.  head() instead of the bare frame: dumping the
# whole DataFrame floods the notebook output.
user_tags.head()
Out[4]:
In [6]:
# Take an explicit copy so adding the 'mix' column below does not write
# through a slice of user_tags (the chained-assignment pitfall that the
# suppressed SettingWithCopyWarning would otherwise flag).
X = user_tags[['nail', 'person', 'sport', 'food', 'hair', 'wedding']].copy()
# 'mix' aggregates three beauty/wedding-related tag counts.
X['mix'] = X['hair'] + X['nail'] + X['wedding']
X.tail()
Out[6]:
In [7]:
# Classification target: 1 = male, 0 = female (matches the scatter-plot
# labels further down).
y = user_tags['gender_male']
In [8]:
# Discard the engineered 'mix' column and the unused tags, leaving only
# 'person' and 'food' for the first model.
X = X.drop(columns=['nail', 'sport', 'hair', 'wedding', 'mix'])
In [9]:
# Confirm only 'person' and 'food' remain after the drop.
X.tail()
Out[9]:
In [10]:
# Scatter of 'food' vs 'person' tag counts, coloured by gender.  A small
# random jitter is added to the y-values so overlapping integer counts
# stay visible.
np.random.seed(0)  # BUG FIX: was `np.random.seed = 0`, which rebound the
                   # seed *function* instead of seeding the RNG
xmin, xmax = -2, 12
ymin, ymax = -2, 17
index_male = y[y == 1].index
index_female = y[y == 0].index
fig, ax = plt.subplots()
cm = plt.cm.RdBu                                     # NOTE(review): unused
cm_bright = ListedColormap(['#FF0000', '#0000FF'])   # NOTE(review): unused
sc = ax.scatter(X.loc[index_male, 'food'],
                X.loc[index_male, 'person'] + (np.random.rand(len(index_male)) - 0.5) * 0.1,
                color='b', label='male', alpha=0.3)
sc = ax.scatter(X.loc[index_female, 'food'],
                X.loc[index_female, 'person'] + (np.random.rand(len(index_female)) - 0.5) * 0.1,
                color='r', label='female', alpha=0.3)
ax.set_xlabel('food')
ax.set_ylabel('person')
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
ax.legend(bbox_to_anchor=(1.4, 1.03))
plt.show()
In [11]:
# Rebuild the feature matrix with a wider tag subset for modelling.
feature_columns = ['nail', 'person', 'sport', 'food', 'coffee', 'cake', 'beer', 'sky']
X = user_tags[feature_columns]
y = user_tags["gender_male"]
In [12]:
clf = LogisticRegression()

def cross_val(clf, X, y, K, random_state=0):
    """Return the K-fold cross-validation scores of `clf` on (X, y).

    Shuffles before splitting so any ordering of rows by class does not
    bias the folds.
    """
    # model_selection.KFold takes n_splits; the removed
    # cross_validation.KFold took the sample count as first argument.
    cv = KFold(n_splits=K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    return scores

scores = cross_val(clf, X, y, 6)
print('Scores:', scores)
print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
In [13]:
# Shallow, entropy-based decision tree as a more interpretable baseline.
clf = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    min_samples_leaf=2,
)
scores = cross_val(clf, X, y, K=5)
print('Scores:', scores)
mean_score, spread = scores.mean(), scores.std() * 2
print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(mean_score, spread))
In [153]:
# 50/50 train/test split.  NOTE(review): X and y here are whatever the most
# recently executed cell left behind — execution counts in this notebook are
# out of order, so re-run top-to-bottom for reproducible results.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
In [271]:
# Candidate SVC hyper-parameter grid for a grid search.
# NOTE(review): defined but never passed to GridSearchCV anywhere in this
# notebook — either wire it up or delete the cell.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
In [15]:
# Fit an RBF-kernel SVM (C=100) on a 50/50 train/validation split of a
# re-selected tag subset.  The bare `clf.fit(...)` on the last line makes
# the fitted estimator the cell's displayed output.
X = user_tags[['nail', 'person', 'sport', 'food', 'coffee', 'wedding', 'cake', 'beer']]
y = user_tags["gender_male"]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.5, random_state=1)
clf = SVC(kernel='rbf', C=100)
clf.fit(X_train, y_train)
Out[15]:
In [16]:
# Predicted gender labels for the held-out validation half.
clf.predict(X_val)
Out[16]:
In [17]:
# NOTE(review): the bare `y_val` below is dead code — only the last
# expression of a cell is displayed, so this line shows nothing.
y_val
# Mean accuracy of the fitted SVM on the validation half.
clf.score(X_val, y_val)
Out[17]:
In [18]:
# NOTE(review): XX is a bare alias of user_tags, not a copy.  This is safe
# here only because the later drop() returns a new frame rather than
# mutating; use user_tags.copy() to be explicit.
XX = user_tags
XX.tail()
Out[18]:
In [19]:
# Strip identifier and label columns so XX holds only tag features.
XX = XX.drop(columns=['user_id', 'user_name', 'gender_male'])
In [20]:
# NOTE(review): displays the entire X frame — prefer X.head() to keep the
# notebook output small.
X
Out[20]:
In [21]:
# 50/50 split on the full tag matrix.  NOTE(review): uses train_size here but
# test_size elsewhere — identical effect at 0.5, just inconsistent style.
XX_train, XX_val, y_train, y_val = train_test_split(XX, y, train_size=0.5, random_state=1)
In [22]:
clf = LogisticRegression()

def cross_val(clf, XX, y, K, random_state=0):
    """Return K-fold cross-validation scores of `clf` on (XX, y).

    NOTE(review): duplicate of the earlier cross_val (only the parameter
    name differs) — consolidate into a single definition near the top.
    """
    # model_selection.KFold takes n_splits, not the sample count.
    cv = KFold(n_splits=K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, XX, y, cv=cv)
    return scores

scores = cross_val(clf, XX, y, 3)
print('Scores:', scores)
print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
In [5]:
# Wide tag selection for feature engineering.  Copy explicitly so the next
# cell's column arithmetic works on an independent frame, not a slice of
# user_tags (avoids the chained-assignment pitfall).
X = user_tags[['nail','hair', 'person', 'sport', 'food','night','coffee','wedding','cake','beer', 'dog', 'animal', 'tree','blossom','cat', 'flower','sky','nature','cherry']].copy()
y = user_tags["gender_male"]
In [6]:
# Collapse correlated tags into coarser features (animal, cosme, nature).
# assign(...) + drop(...) builds a new frame instead of assigning into a
# slice of user_tags, which sidesteps pandas' chained-assignment pitfall
# that the original in-place column writes could trigger.
X = (
    X.assign(
        animal=X['animal'] + X['dog'] + X['cat'],
        cosme=X['hair'] + X['nail'],
        nature=X['nature'] + X['sky'] + X['flower'] + X['tree'] + X['blossom'] + X['cherry'],
    )
    .drop(columns=['nail', 'hair', 'dog', 'cat', 'sky', 'flower', 'tree', 'blossom', 'cherry'])
)
In [7]:
# Inspect the aggregated feature set.
X.tail()
Out[7]:
In [8]:
clf = LogisticRegression()

def cross_val(clf, X, y, K, random_state=0):
    """Return K-fold cross-validation scores of `clf` on (X, y).

    NOTE(review): third duplicate definition of cross_val in this
    notebook — consolidate into one definition near the top.
    """
    # model_selection.KFold takes n_splits, not the sample count.
    cv = KFold(n_splits=K, shuffle=True, random_state=random_state)
    scores = cross_val_score(clf, X, y, cv=cv)
    return scores

# Sweep the number of folds to see how stable the score estimate is.
for i in range(2, 12):
    scores = cross_val(clf, X, y, i)
    print(i)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
In [29]:
# Same fold-count sweep for an RBF SVM with a harder margin (C=1000).
clf = SVC(kernel='rbf', C=1000)
for n_folds in range(2, 12):
    scores = cross_val(clf, X, y, n_folds)
    print(n_folds)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std() * 2))
In [30]:
# Fold-count sweep for a deeper decision tree (max_depth=5).
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5,
                             min_samples_leaf=2)
for n_folds in range(2, 12):
    scores = cross_val(clf, X, y, n_folds)
    print(n_folds)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std() * 2))
In [11]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23; import the standalone joblib package directly instead.
import joblib
In [9]:
# Fit the final logistic-regression model on the full data set for export.
# NOTE(review): no held-out evaluation of this exact model — the CV scores
# above are the only performance estimate.
clf = LogisticRegression()
clf.fit(X,y)
Out[9]:
In [12]:
# Persist the fitted model.  NOTE(review): joblib pickles are not portable
# across library versions — record the scikit-learn version alongside the
# artifact.
joblib.dump(clf, 'clf.pkl')
Out[12]:
In [35]:
# Final sanity check of the feature frame backing the exported model
# (column order must match at prediction time).
X.tail()
Out[35]:
In [ ]: