In [106]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


/kaggle/input/train.csv
/kaggle/input/test.csv

In [107]:
# Load the training and test splits from the Kaggle input directory
df_train = pd.read_csv('/kaggle/input/train.csv')
df_test = pd.read_csv('/kaggle/input/test.csv')
df_train.shape


Out[107]:
(48192, 3)
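A quick look at how the rating target is distributed can help interpret the macro-averaged F1 score used later; a small sketch, not executed in this run:

# Share of each rating class in the training set
print(df_train['target'].value_counts(normalize=True).sort_index())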

In [108]:
df_test.head(2)


Out[108]:
review title target
0 I am from old town, and I stayed in this hotel... Incredible Hotel 5
1 We have been coming to the Ocean Park Inn for ... We Love this beach front Inn 5

In [109]:
# Optional: subsample the training set for faster experimentation (left disabled)
#df_train = df_train.sample(10000, random_state=0)

In [110]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Bag-of-words counts over the review titles; the term-frequency scaling step is left disabled
tf_transformer = TfidfTransformer(use_idf=False)
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(df_train['title'])
#X_train_counts = tf_transformer.fit_transform(X_train_counts)
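If the term-frequency scaling were re-enabled, the transformer fitted on the training counts would also need to be applied to the test counts; a minimal sketch reusing the objects above, not executed here:

# TF scaling applied consistently to train and test (optional)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_test_tf = tf_transformer.transform(count_vect.transform(df_test['title']))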

In [111]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit an Extra-Trees baseline; .toarray() materialises the dense document-term matrix
classifier = ExtraTreesClassifier(random_state=123)
classifier.fit(X_train_counts.toarray(), df_train['target'])


Out[111]:
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=123, verbose=0,
                     warm_start=False)
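Scikit-learn's tree ensembles also accept sparse input, so the dense conversion above can be skipped to save memory; a sketch, not run here:

# Fit directly on the sparse count matrix instead of a dense array
classifier_sparse = ExtraTreesClassifier(random_state=123)
classifier_sparse.fit(X_train_counts, df_train['target'])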

In [112]:
from sklearn.metrics import f1_score

# Macro-averaged F1 on the held-out test titles
f1_score(df_test['target'], classifier.predict(count_vect.transform(df_test['title'])), average='macro')


Out[112]:
0.4261187748670672
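The same baseline can also be packaged as a single scikit-learn Pipeline, so the vectoriser and classifier are always fitted and applied together; a sketch under the same column names, not executed here:

from sklearn.pipeline import make_pipeline

# Vectorise titles and fit the classifier in one object (reuses the imports above)
pipeline = make_pipeline(CountVectorizer(), ExtraTreesClassifier(random_state=123))
pipeline.fit(df_train['title'], df_train['target'])
print(f1_score(df_test['target'], pipeline.predict(df_test['title']), average='macro'))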