notebook.community

Edit and run



In [2]:

    
import csv, re, string
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline



In [15]:

    
# Matches any single ASCII punctuation character; used to strip punctuation
# from the combined occupation/employer text.
PUNCTUATION = re.compile(
    '[%s]' % re.escape(string.punctuation)
)

# Leading letters of the industry codes that are accepted as class labels.
VALID_CLASSES = list('ABCDEFGHJKLMTXZ')



In [16]:

    
# Build the training set: one [text, label] pair per usable CSV row.
# Columns 0 and 1 hold the occupation and employer strings; column 2 holds the
# industry code, of which only the first character is used as the label.
data = []
with open('data/category-training.csv', 'r') as f:
    inputreader = csv.reader(f, delimiter=',', quotechar='"')
    for r in inputreader:
        # csv.reader yields [] for blank lines and short lists for truncated
        # rows; skip those instead of failing with IndexError on r[2].
        if len(r) < 3:
            continue
        industry_code = r[2]
        # Keep only rows whose code is longer than one character and starts
        # with one of the VALID_CLASSES letters.
        if len(industry_code) > 1 and industry_code[0] in VALID_CLASSES:
            # Concatenate the occupation and employer strings and remove
            # punctuation; both are used in prediction. Done only for rows
            # that are actually kept.
            text = PUNCTUATION.sub('', ' '.join(r[0:2]))
            # We only classify the first character of the industry prefix
            # ("A", "B", etc.), not the whole code.
            data.append([text, industry_code[0]])



In [18]:

    
# Split the [text, label] pairs in `data` into two parallel arrays:
# texts[i] is the cleaned occupation/employer string and classes[i] is its
# one-letter class label.
texts = np.array([el[0] for el in data])
classes = np.array([el[1] for el in data])



In [19]:

    
# Show a sample of the training texts. Parenthesized so the cell runs on
# Python 3 as well as Python 2 (same output for a single argument).
print(texts)









    



['Owner First Priority Title Llc' 'SENIOR PARTNER ARES MANAGEMENT'
 'CEO HB AGENCY' ..., 'INVESTMENT EXECUTIVE FEF MANAGEMENT LLC'
 'Owner Fair Funeral Home' 'ST MARTIN  LIRERRE LAW FIRM ']



In [20]:

    
# Show a sample of the class labels. Parenthesized so the cell runs on
# Python 3 as well as Python 2 (same output for a single argument).
print(classes)









    



['F' 'Z' 'Z' ..., 'F' 'G' 'K']



In [21]:

    
# Bag-of-words features feeding a logistic-regression classifier.
pipeline = Pipeline(steps=[
    (
        'vectorizer',
        CountVectorizer(
            # Count single words and adjacent word pairs.
            ngram_range=(1, 2),
            stop_words='english',
            # Ignore terms that appear in fewer than two documents.
            min_df=2,
            # An integer max_df is an absolute document count; set to the
            # number of training texts, it excludes nothing for being too
            # frequent.
            max_df=len(texts),
        ),
    ),
    ('classifier', LogisticRegression()),
])



In [22]:

    
# Train the vectorizer and classifier on the full labelled set. fit() is the
# cell's last expression, so its return value is displayed as the cell output.
pipeline.fit(
    np.asarray(texts),
    np.asarray(classes),
)









    Out[22]:





Pipeline(steps=[('vectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=66923, max_features=None, min_df=2,
        ngram_range=(1, 2), preprocessor=None, stop_words='english...',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0))])



In [27]:

    
# Spot-check a prediction for an occupation-only input. Parenthesized so the
# cell runs on Python 3 as well as Python 2.
print(pipeline.predict(['LAWYER']))









    



['K']



In [28]:

    
# Spot-check a prediction for the input 'SKADDEN ARPS'. Parenthesized so the
# cell runs on Python 3 as well as Python 2.
print(pipeline.predict(['SKADDEN ARPS']))









    



['K']



In [29]:

    
# Spot-check a prediction for the input 'COMPUTER PROGRAMMER'. Parenthesized
# so the cell runs on Python 3 as well as Python 2.
print(pipeline.predict(['COMPUTER PROGRAMMER']))









    



['J']



In [ ]: