In [106]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os

# Walk the Kaggle input directory and print every file path, so the reader
# can see which data files are available to this kernel.
# (The export had flattened the loop indentation; restored here.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Any results you write to the current directory are saved as output.
In [107]:
# Load the train/test splits shipped with the competition input directory.
TRAIN_CSV = '/kaggle/input/train.csv'
TEST_CSV = '/kaggle/input/test.csv'

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

# Display the training-set dimensions (rows, columns) as the cell output.
df_train.shape
Out[107]:
In [108]:
# Peek at the first two test rows to sanity-check the schema — later cells
# read the 'title' and 'target' columns from this frame.
df_test.head(2)
Out[108]:
In [109]:
# Optional: uncomment to downsample the training set to 10k rows
# (fixed seed) for faster iteration during development.
#df_train = df_train.sample(10000, random_state=0)
In [110]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Build a bag-of-words (raw term count) representation of the training titles.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(df_train['title'])

# TF-only transformer (IDF disabled). Its transform step below is disabled,
# so the model trains on raw counts; kept in scope in case a later cell uses it.
tf_transformer = TfidfTransformer(use_idf=False)
#X_train_counts = tf_transformer.fit_transform(X_train_counts)
In [111]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit extremely-randomized trees on the bag-of-words features.
# Pass the sparse document-term matrix directly: ExtraTreesClassifier.fit
# accepts scipy CSR input, so densifying with .toarray() — which can exhaust
# memory once the vocabulary is large — is unnecessary.
classifier = ExtraTreesClassifier(random_state=123)
classifier.fit(X_train_counts, df_train['target'])
Out[111]:
In [112]:
from sklearn.metrics import f1_score

# Vectorize the test titles with the SAME fitted vocabulary, predict, and
# report the macro-averaged F1 (unweighted mean over classes) as cell output.
X_test_counts = count_vect.transform(df_test['title'])
test_preds = classifier.predict(X_test_counts)
f1_score(df_test['target'], test_preds, average='macro')
Out[112]: