In [5]:
import csv
from datetime import datetime, timedelta
import re
import nltk
# Name of the CSV input file and of the meeting-date column header.
# NOTE(review): neither constant is referenced in the cells visible here —
# presumably used by the helpers module or by later cells; confirm.
INPUTFILE = 'outputgeo.csv'
MEETINGDATE_HEADER = 'Item Type Metadata:meeting date'

# Maps a lower-cased "day" word or phrase to a timedelta offset.
# get_features() only tests membership in this mapping; the offsets themselves
# are not read anywhere in the visible code.
alldays = {
    'today': timedelta(days=0),
    'this day': timedelta(days=0),
    'tomorrow': timedelta(days=1),
    'to-morrow': timedelta(days=1),
}
# Weekday names get consecutive offsets: sunday=1 ... saturday=7.
# NOTE(review): presumably "days after a Saturday reference date" — confirm.
alldays.update(
    (weekday, timedelta(days=offset))
    for offset, weekday in enumerate(
        ('sunday', 'monday', 'tuesday', 'wednesday',
         'thursday', 'friday', 'saturday'),
        start=1,
    )
)
# Punctuation that may surround a day word in running text ("Monday," / "(Tuesday)").
_TOKEN_PUNCTUATION = ".,;:!?\"'()[]"


def get_features(inputtext):
    """Build the feature dict used to classify one chunk of text.

    Args:
        inputtext: the text to describe (a string).

    Returns:
        A dict with three entries:

        * ``'datedays'`` - how many times a key of ``alldays`` occurs in the
          lower-cased text. Tokens are whitespace-separated and stripped of
          surrounding punctuation; multi-word keys such as ``'this day'`` are
          matched against consecutive tokens.
        * ``'uppercasefrag'`` - fraction of the characters before the first
          full stop that are upper case (0.0 when that fragment is empty).
        * ``'doublespccapitals'`` - number of runs of capitals, whitespace,
          full stops and apostrophes that follow a blank line and are longer
          than 6 characters (the two newlines count towards the length).
    """
    features = {}

    # Feature 1 - number of 'date' days in the sentence.
    # Strip more than just "." so that "Monday," or "(to-morrow)" still count.
    tokens = [token.strip(_TOKEN_PUNCTUATION) for token in inputtext.lower().split()]
    # Group the known phrases by word count so that one sliding window per
    # width also finds multi-word keys, which a single token can never equal.
    phrases_by_width = {}
    for phrase in alldays:
        words = tuple(phrase.split())
        phrases_by_width.setdefault(len(words), set()).add(words)
    datedays = 0
    for width, wanted in phrases_by_width.items():
        datedays += sum(
            1
            for start in range(len(tokens) - width + 1)
            if tuple(tokens[start:start + width]) in wanted
        )
    features['datedays'] = datedays

    # Feature 2 - capitalised bit before the first full stop,
    # expressed as the fraction of its characters that are upper case.
    frag = inputtext.split(".")[0]
    if frag:
        features['uppercasefrag'] = sum(1 for c in frag if c.isupper()) / float(len(frag))
    else:
        # Empty text, or text starting with ".": avoid dividing by zero.
        features['uppercasefrag'] = 0.0

    # Feature 3 - capitalised runs that start right after a blank line.
    matches = re.findall(r"\n\n[A-Z\s\.\']+", inputtext)
    features['doublespccapitals'] = len([x for x in matches if len(x) > 6])

    return features
from helpers import get_meeting_texts, get_negative_texts
# Positive examples: every meeting text, labelled "meeting".
geodoc = get_meeting_texts()
meeting_set = list((get_features(doc), "meeting") for doc in geodoc)

# Negative examples: every non-meeting text, labelled "nopes".
negs = get_negative_texts()
neg_set = list((get_features(doc), "nopes") for doc in negs)

# Show one labelled feature dict from each class as a quick sanity check.
print(meeting_set[0])
print(neg_set[0])

# TODO: split the combined examples into a training set and a held-out test
# set; for now the classifier is trained on all of them.
train_set = meeting_set + neg_set
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Usage example: classify a short, meeting-like piece of text.
guess = classifier.classify(
    get_features(
        "TEXT. On Monday, at Manchester.\n\nTEXT. On Monday, at Manchester.\n\nTEXT. On to-morrow, at Manchester.\n\nTEXT. On Tuesday, at Manchester.\n\n"
    )
)
print(guess)
# Second usage example: text that is clearly not a meeting notice (an
# arcade-cabinet listing). Presumably expected to come out as "nopes".
negative_sample_text = """The Simpsons, X-Men.
Control panel size (control panel can be detached and rotated for doorways and shipping) - 36-3/4" wide x 14" deep.
Cabinet footprint - 28-1/4" wide x 34-1/4" deep.
Cabinet with control panel attached - 36-3/4" wide x 39" deep.
Heigth - 71" to 72" high, depending on casters.
Shipping pallet (with panel rotated) - 32" x 40" x 5", 40 lbs.
Cabinet weight with LCD monitor - 260 lbs.
Palleted shipping weight - about 310 lbs.
P1010005 (3)3. Nintendo classic 19" vertical and horizontal monitor upright cabinet.
Dedicated vertical games - Donkey Kong, Donkey Kong Jr., Donkey Kong 3.
Dedicated horizontal games - Most of the Vs. games, Vs. Super Mario Bros.
Size at control panel - 23-1/2" wide x 33-1/2" deep.
Heigth - 67" high, no casters.
Shipping pallet - 28" x 36" x 5", 35 lbs.
Weight - 180 lbs. with LCD/LED monitor.
Palleted shipping weight - about 225 lbs.
P1010004 (2)4. Golden Tee Golf 25" horizontal monitor upright cabinet.
Dedicated original games - Standard cabinet for all modern Golden Tee Golf games.
Footprint - 27-3/8" wide x 40" deep.
Size at control panel (as with #2 above, the control panel can be detached for tight clearances) - 29-3/4" wide x 43" deep.
Height - 75" minimum to 78" high, depending on pads or levelers.
Shipping pallet - 34" x 46" x 5".
Palleted shipping weight - about 325 lbs.
Size at control panel when outfitted with 4-player control panel (as with #2 above, the control panel will be detached and rotated for shipping) - 39" wide x 43" deep.
Note - We have available original Golden Tee refurbished cabinets (shown in photo above), and our brand new Golden Tee style cabinet.
Classics-cade5. Midway-style 19" vertical monitor cocktail cabinet.
Size at table top - 22" wide x 32" long.
Heigth - 29" table top to floor.
Weight - 100 lbs. with LCD/LED monitor.
Shipping pallet - 26" x 36" x 5".
Palleted shipping weight - about 150 lbs.
Killer instinct6. Midway competition style upright cabinet.
Dedicated original games - Killer Instinct (2-player), NBA Jam (4-player)
Footprint - 34" deep x 28-1/2" wide.
Size at control panel for 2-player cabinet - 36" deep x 30-1/2" wide."""
negative_sample_guess = classifier.classify(get_features(negative_sample_text))
print(negative_sample_guess)
In [2]:
# Scratch check: show which "capitals after a blank line" runs the
# get_features() pattern finds in the first meeting text.
geodoc = get_meeting_texts()
txt = list(geodoc)[0]
# Raw string: "\s" and "\." are not valid Python string escapes, so the
# non-raw form triggers an invalid-escape warning on modern Python. The regex
# engine interprets \n, \s, \. and \' itself, so the pattern matches the same text.
matches = re.findall(r"\n\n[A-Z\s\.\']+", txt, re.MULTILINE | re.UNICODE)
print(matches)
In [ ]:
In [ ]: