ClassifierTraining



In [5]:
import csv
from datetime import datetime, timedelta

import re

import nltk

INPUTFILE = 'outputgeo.csv'

MEETINGDATE_HEADER = 'Item Type Metadata:meeting date'

alldays = { 'today': timedelta(days=0),
            'this day': timedelta(days=0),
            'tomorrow': timedelta(days=1),
            'to-morrow': timedelta(days=1),
            'sunday': timedelta(days=1),
            'monday': timedelta(days=2),
            'tuesday': timedelta(days=3),
            'wednesday': timedelta(days=4),
            'thursday': timedelta(days=5),
            'friday': timedelta(days=6),
            'saturday': timedelta(days=7) }

def get_features(inputtext):
    features = {}
    
    matching = []
    for token in inputtext.lower().split():
        if token.strip(".") in alldays:
            matching.append(token.strip("."))
            
    # Feature 1 - number of 'date' days in the sentence:
    features['datedays'] = len(matching)
    
    # Feature 2 - Capitalised bit before the first full stop.
    # So - percentage of characters that are capitalised:
    frag = inputtext.split(".")[0]
    features['uppercasefrag'] = sum(1 for c in frag if c.isupper()) / float(len(frag))
    
    matches = re.findall(r"\n\n[A-Z\s\.\']+", inputtext)
    if matches:
        features['doublespccapitals'] = len([x for x in matches if len(x) > 6])
    else:
        features['doublespccapitals'] = 0
    
    return features

from helpers import get_meeting_texts, get_negative_texts

# Positives
geodoc = get_meeting_texts()

meeting_set = [(get_features(doc), "meeting") for doc in geodoc]

# Negatives:
negs = get_negative_texts()

neg_set = [(get_features(doc), "nopes") for doc in negs]

print(meeting_set[0])
print(neg_set[0])

# Need a negative set - article chunks that are not meetings.

# Next, need to make a training set and a test set from the combination of the two.

train_set = meeting_set + neg_set

classifier = nltk.NaiveBayesClassifier.train(train_set)

# and how to use it:

guess = classifier.classify(get_features("TEXT. On Monday, at Manchester.\n\nTEXT. On Monday, at Manchester.\n\nTEXT. On to-morrow, at Manchester.\n\nTEXT. On Tuesday, at Manchester.\n\n"))
print(guess)

print(classifier.classify(get_features("""The Simpsons, X-Men.

Control panel size (control panel can be detached and rotated for doorways and shipping) - 36-3/4" wide x 14" deep.

Cabinet footprint - 28-1/4" wide x 34-1/4" deep.

Cabinet with control panel attached - 36-3/4" wide x 39" deep.

Heigth - 71" to 72" high, depending on casters.

Shipping pallet (with panel rotated) - 32" x 40" x 5", 40 lbs.

Cabinet weight with LCD monitor - 260 lbs.

Palleted shipping weight - about 310 lbs.

 

 

P1010005 (3)3.  Nintendo classic 19" vertical and horizontal monitor upright cabinet.

Dedicated vertical games - Donkey Kong, Donkey Kong Jr., Donkey Kong 3.

Dedicated horizontal games - Most of the Vs. games, Vs. Super Mario Bros.

Size at control panel - 23-1/2" wide x 33-1/2" deep.

Heigth - 67" high, no casters.

Shipping pallet - 28" x 36" x 5", 35 lbs.

Weight - 180 lbs. with LCD/LED monitor.

Palleted shipping weight - about 225 lbs.

 

 

P1010004 (2)4.  Golden Tee Golf 25" horizontal monitor upright cabinet.

Dedicated original games - Standard cabinet for all modern Golden Tee Golf games.

Footprint - 27-3/8" wide x 40" deep.

Size at control panel (as with #2 above, the control panel can be detached for tight clearances) - 29-3/4" wide x 43" deep.

Height - 75" minimum to 78" high, depending on pads or levelers.

Shipping pallet - 34" x 46" x 5".

Palleted shipping weight - about 325 lbs.

Size at control panel when outfitted with 4-player control panel (as with #2 above, the control panel will be detached and rotated for shipping) - 39" wide x 43" deep.

Note - We have available original Golden Tee refurbished cabinets (shown in photo above), and our brand new Golden Tee style cabinet.

 


Classics-cade5.  Midway-style 19" vertical monitor cocktail cabinet.

Size at table top - 22" wide x 32" long.

Heigth - 29" table top to floor.

Weight - 100 lbs. with LCD/LED monitor.

Shipping pallet - 26" x 36" x 5".

Palleted shipping weight - about 150 lbs.

 

 

 Killer instinct6.  Midway competition style upright cabinet.

Dedicated original games - Killer Instinct (2-player), NBA Jam (4-player)

Footprint - 34" deep x 28-1/2" wide.

Size at control panel for 2-player cabinet - 36" deep x 30-1/2" wide.""")))


({'uppercasefrag': 1.0, 'datedays': 41, 'doublespccapitals': 32}, 'meeting')
({'uppercasefrag': 0.013333333333333334, 'datedays': 0, 'doublespccapitals': 0}, 'nopes')
meeting
nopes

In [2]:
geodoc = get_meeting_texts()
txt = [i for i in geodoc][0]
matches = re.findall("\n\n[A-Z\s\.\']+", txt, re.MULTILINE|re.UNICODE)
print(matches)


['\n\nWEST RIDING. ', '\n\nDEWSBURY. ', "\n\nMR. W. D. TAYLOR'S ROUTE. ", '\n\nOLDHAM. ', '\n\nMANCHESTER. ', '\n\nHOLBECK. ', '\n\nUPPER WORTLEY. ', '\n\nLONDON. ', '\n\nMR. STALLWOOD ', '\n\nLAMBETH. ', '\n\nMR. WILLIAM JONES', '\n\nDELPH. ', '\n\nCHOWBENT. ', '\n\nSTALYBRIDGE. ', '\n\nOLDHAM. ', '\n\nROCHDALE. ', '\n\nMR. LINNEY', '\n\nMR. BAIRSTOW ', '\n\nBATH. ', '\n\nNEWPORT AND PONTYPOOL. ', '\n\nMACCLESFIELD. ', "\n\nMR. DOYLE'S ROUTE ", '\n\nCHELSEA. ', '\n\nSHOREDITCH. ', '\n\nMARPLE. ', '\n\nWINCHCOMB. ', '\n\nNOTTINGHAM. ', '\n\nSTOCKPORT. ', '\n\nBRADFORD. ', '\n\nHUNSLET. ', '\n\nUPPER WORTLEY. ', '\n\nBARNSLEY. ']

In [ ]:


In [ ]: