In [2]:
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

preprocess is a multi-language preprocessor The sklearn.preprocessing package provides several common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators.

sklearn(scikit-learn) package can't handle gender data because it has string (male and female inside), so need to be convered to boolean (0 and 1).

In scikit-learn a random split into training and test (to avoid the cases of overfitting) sets can be quickly computed with the train_test_split helper function.


In [3]:
########## STEP 1: DATA IMPORT AND PREPROCESSING ##########

# Here we're taking in the training data and splitting it into two lists: One with the text of
# each bill title, and the second with each bill title's corresponding category. Order is important.
# The first bill in list 1 should also be the first category in list 2.
training = [line.strip().split('|') for line in open('bills_training.txt', 'r', encoding="utf8").readlines()]
text = [t[0] for t in training if len(t) > 1]
labels = [t[1] for t in training if len(t) > 1]

# A little bit of cleanup for scikit-learn's benefit. Scikit-learn models wants our categories to
# be numbers, not strings. The LabelEncoder performs this transformation.
encoder = preprocessing.LabelEncoder()
correct_labels = encoder.fit_transform(labels)

In [4]:
# is it a random number or number of occurrence ?
correct_labels


Out[4]:
array([10, 31, 25, ..., 19, 19, 27], dtype=int32)

In [6]:
########## STEP 2: FEATURE EXTRACTION ##########
vectorizer = CountVectorizer(stop_words='english')
data = vectorizer.fit_transform(text)

In [12]:
# what exactly is the number corresponding to the category 0,1,2.... what's number 1 stands for which is outside the tuple
print(data)


  (0, 4986)	1
  (0, 5059)	1
  (0, 7052)	1
  (0, 3241)	1
  (0, 5719)	1
  (0, 5391)	1
  (0, 6894)	1
  (0, 7242)	1
  (1, 4986)	1
  (1, 7052)	1
  (1, 5391)	1
  (1, 6894)	1
  (1, 4995)	1
  (1, 4617)	1
  (1, 5970)	1
  (1, 6808)	1
  (1, 5933)	1
  (2, 4986)	1
  (2, 5059)	1
  (2, 7052)	1
  (2, 5391)	1
  (2, 6894)	1
  (2, 4995)	1
  (2, 7053)	1
  (2, 2036)	1
  :	:
  (5743, 6882)	1
  (5743, 1776)	1
  (5744, 6040)	1
  (5744, 6896)	1
  (5744, 6453)	1
  (5744, 5209)	1
  (5745, 6396)	1
  (5745, 5263)	1
  (5745, 5742)	1
  (5746, 6016)	1
  (5746, 5288)	1
  (5746, 5525)	1
  (5747, 6396)	1
  (5747, 5263)	1
  (5747, 5742)	1
  (5748, 6016)	1
  (5748, 5288)	1
  (5748, 5525)	1
  (5749, 948)	1
  (5749, 7069)	1
  (5749, 5829)	1
  (5749, 6204)	1
  (5749, 6896)	1
  (5749, 7002)	1
  (5749, 1776)	1

In [9]:
########## STEP 3: MODEL BUILDING ##########
model = DecisionTreeClassifier()
fit_model = model.fit(data, correct_labels)

In [11]:
print(fit_model)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [10]:
# ########## STEP 4: EVALUATION ##########
# Evaluate our model with 10-fold cross-validation
scores = cross_validation.cross_val_score(model, data, correct_labels, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\cross_validation.py:516: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=5.
  % (min_labels, self.n_folds)), Warning)
Accuracy: 0.65 (+/- 0.05)

In [11]:
# ########## STEP 5: APPLYING THE MODEL ##########
docs_new = ["Public postsecondary education: executive officer compensation.",
            "An act to add Section 236.3 to the Education code, related to the pricing of college textbooks.",
            "Political Reform Act of 1974: campaign disclosures.",
            "An act to add Section 236.3 to the Penal Code, relating to human trafficking."
        ]

test_data = vectorizer.transform(docs_new)

for i in range(len(docs_new)):
    print('%s -> %s' % (docs_new[i], encoder.classes_[model.predict(test_data.toarray()[i])]))


Public postsecondary education: executive officer compensation. -> ['Education']
An act to add Section 236.3 to the Education code, related to the pricing of college textbooks. -> ['Education']
Political Reform Act of 1974: campaign disclosures. -> ['Campaign Finance and Election Issues']
An act to add Section 236.3 to the Penal Code, relating to human trafficking. -> ['Crime']
c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
c:\users\radhika\appdata\local\programs\python\python35-32\lib\site-packages\sklearn\utils\validation.py:386: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)

In [ ]: