In [1]:
import IPython
import os
import csv
import subprocess
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx
import itertools
from collections import Counter, defaultdict
from sklearn import metrics, cross_validation
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm
from tsa import stdout, stderr, root
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary
import geo
import geo.types
import geo.shapefile.reader
import geo.shapefile.writer
import geojson
from tsa import logging
logger = logging.getLogger(__name__)
head = lambda x: x[0]
# lon = x = easting; lat = y = northing
In [2]:
session = create_session()
documents = session.query(Document).join(Source).\
    filter(Source.name == 'sb5b').all()
full_corpus = MulticlassCorpus(documents)
full_corpus.apply_labelfunc(lambda doc: doc.label or 'Unlabeled')
full_corpus.extract_features(lambda doc: doc.document, features.ngrams,
                             ngram_max=2, min_df=2, max_df=1.0)

def doc_has_lon_lat(doc):
    return doc.details.get('Longitude') is not None and doc.details.get('Latitude') is not None

geolocated_indices = np.array([doc_has_lon_lat(doc) for doc in full_corpus.data])
labeled_geolocated_indices = np.array([doc_has_lon_lat(doc) and doc.label is not None
                                       for doc in full_corpus.data])
print 'Number of geo-located tweets: {:d} ({:.2%})'.format(
    geolocated_indices.sum(), geolocated_indices.mean())
print 'Number of labeled geo-located tweets: {:d} ({:.2%})'.format(
    labeled_geolocated_indices.sum(), labeled_geolocated_indices.mean())
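As a quick sanity check (a sketch, not part of the original run), the label distribution can be inspected with the Counter already imported above:

# hypothetical check: how the labels are distributed across the full corpus
print Counter(doc.label or 'Unlabeled' for doc in full_corpus.data)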
In [3]:
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
labeled_corpus = full_corpus.subset(polar_indices)
logreg_model = linear_model.LogisticRegression(fit_intercept=True, penalty='l2')
logreg_model.fit(labeled_corpus.X, labeled_corpus.y)
Out[3]:
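Before applying the classifier to unlabeled tweets, a held-out accuracy estimate on the polar subset is cheap to compute. This is a sketch using the cross_validation module imported above; the scores depend on the corpus.

# hypothetical check: 10-fold cross-validated accuracy on the labeled For/Against tweets
scores = cross_validation.cross_val_score(
    linear_model.LogisticRegression(fit_intercept=True, penalty='l2'),
    labeled_corpus.X, labeled_corpus.y, cv=10)
print 'Cross-validated accuracy: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std())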
In [4]:
geolocated_corpus = full_corpus.subset(geolocated_indices)
geolocated_pred_y = logreg_model.predict(geolocated_corpus.X)
# geolocated_pred_proba = logreg_model.predict_proba(geolocated_corpus.X)
In [5]:
# countyp020.shp has fields like: {'PERIMETER': 1.82, 'STATE_FIPS': '39',
#   'AREA': 0.199, 'COUNTY': 'Ashtabula County', 'STATE': 'OH', 'FIPS': '39007',
#   'COUNTYP020': 2539, 'SQUARE_MIL': 710.321}
counties_filepath = '/Users/chbrown/corpora-public/census-shapefiles/countyp020/countyp020.shp'
# counties_filepath = '/Users/chbrown/corpora-public/census-shapefiles/ESRI-UScounties/UScounties.shp'
shapefile_reader = geo.shapefile.reader.Reader(counties_filepath)
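To confirm the schema described in the comment above, the reader's field names can be printed directly (field_names is relied on below, so it is assumed available on this Reader):

# inspect the attribute schema before depending on STATE / COUNTY / FIPS below
print shapefile_reader.field_names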
In [6]:
def get_ohio_counties(shapefile_reader):
    for record, shape in zip(shapefile_reader.records(), shapefile_reader.shapes()):
        attributes = dict(zip(shapefile_reader.field_names, record))
        state = attributes['STATE']
        county = attributes['COUNTY'].replace('County', '').strip()
        if state == 'OH' and county:
            yield county, attributes, shape
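A quick check of the generator (a sketch; exact counts depend on the shapefile): Ohio has 88 counties, but countyp020.shp can store a county as several shapes, e.g., when it includes islands, so there may be more records than counties.

ohio_records = list(get_ohio_counties(shapefile_reader))
print len(ohio_records), 'records'
print len(set(county for county, _, _ in ohio_records)), 'distinct counties (Ohio has 88)'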
In [7]:
grouped = itertools.groupby(sorted(get_ohio_counties(shapefile_reader), key=head), head)
# named county_features to avoid shadowing the tsa.science.features module imported above
county_features = []
for county, group in grouped:
    group = list(group)
    # print county, len(group)
    if len(group) == 1:
        geometry = group[0][2].__geo_interface__
    else:
        # counties split across multiple shapes (e.g., islands) merge into one MultiPolygon
        geometry = geojson.MultiPolygon([item[2].__geo_interface__['coordinates'] for item in group])
    properties = dict(county=county, FIPS=group[0][1]['FIPS'], area_sq_mi=group[0][1]['SQUARE_MIL'])
    # bbox = geo.types.BoundingBox(*group[0][2].bbox)
    bbox = None
    county_features += [geo.types.Feature(geometry, properties, id=county, bbox=bbox)]
feature_collection = geo.types.FeatureCollection(county_features)
out_of_state = geo.types.Feature(dict(type=None), dict(FIPS=None), id='Out-of-state')
feature_collection.features += [out_of_state]
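If every county collapsed to a single feature, the collection should now hold 89 features: the 88 Ohio counties plus the Out-of-state catch-all (assuming the shapefile covers the whole state).

print len(feature_collection.features), 'features (88 counties + Out-of-state)'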
In [8]:
counties = [
    ('Ottawa', feature_collection['Ottawa']),
    ('Franklin', feature_collection['Franklin']),
]
cities = [
    ('Austin', (-97.75, 30.25)),            # in Texas, well outside Ohio
    ('Columbus', (-82.98, 39.98)),          # in Franklin County
    ('Port Clinton', (-82.9433, 41.5063)),  # in Ottawa County, which has islands
]
In [9]:
# for feature in feature_collection.features:
#     print feature.id, feature.bbox
# ottawa = feature_collection['Ottawa']
# ottawa.contains(-82.9433, 41.5063)
In [10]:
#parts = shape.parts.tolist() + [len(shape.points)]
#polygons = [shape.points[i:j] for i, j in zip(parts, parts[1:])]
print 'Testing county coverage...'
for county, feature in counties:
    for city, (lon, lat) in cities:
        # expect True only for Columbus/Franklin and Port Clinton/Ottawa;
        # Austin lies in Texas and should fall outside both counties
        print '{:s} contains {:s}? {:s}'.format(
            county, city, str(feature.contains(lon, lat)))
Add counts for geolocated tweets
In [13]:
def location(document, feature_collection):
    lon = document.details.get('Longitude')
    lat = document.details.get('Latitude')
    features = list(feature_collection.features_containing(lon, lat))
    # country_match = countries.first_area_containing(lon, lat)
    in_state = len(features) == 1
    return features[0].id if in_state else 'Out-of-state'

for feature in feature_collection.features:
    feature.properties['tweets'] = dict(For=0, Against=0, Total=0)
    feature.properties['labeled_tweets'] = dict(For=0, Against=0, Total=0)

# geolocated_counties = np.array([location(document) for document in geolocated_corpus.data])
geolocated_pred_labels = geolocated_corpus.labels[geolocated_pred_y]
for pred_label, document in zip(geolocated_pred_labels, geolocated_corpus.data):
    feature_id = location(document, feature_collection)
    feature = feature_collection[feature_id]
    feature.properties['tweets'][pred_label] += 1
    feature.properties['tweets']['Total'] += 1
    if document.label in ['For', 'Against']:
        # tally predictions separately over the subset that has gold labels
        feature.properties['labeled_tweets'][pred_label] += 1
        feature.properties['labeled_tweets']['Total'] += 1
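A glance at where the geolocated volume concentrates (a sketch; output depends on the data):

# hypothetical peek: the five counties with the most geolocated tweets
by_total = sorted(feature_collection.features,
                  key=lambda f: f.properties['tweets']['Total'], reverse=True)
for feature in by_total[:5]:
    print feature.id, feature.properties['tweets']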
Add population data
In [12]:
fips2county = dict((feature.properties['FIPS'], feature.id)
                   for feature in feature_collection.features)
reader = csv.DictReader(open('/Users/chbrown/corpora/ohio/census/DataSet.txt'))
for row in reader:
    fips = row['fips']
    # POP010210 = Population, 2010
    population = row['POP010210']
    ohio_county = fips2county.get(fips)
    if ohio_county:
        county_feature = feature_collection[ohio_county]
        county_feature.properties['population'] = int(population)

def feature_population(feature):
    return feature.properties.get('population', 0)

# for feature in sorted(feature_collection.features, key=feature_population):
#     print feature.id, feature_population(feature)

total_pop = sum(feature_population(feature) for feature in feature_collection.features)
print total_pop, 'vs. Wikipedia\'s "2010 = 11536504"'
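With populations attached, a per-capita view of tweet volume is a short step away (a sketch; counties without a census row default to zero population and are skipped):

# hypothetical: geolocated tweets per 100k residents
for feature in feature_collection.features:
    pop = feature_population(feature)
    if pop > 0:
        rate = 100000.0 * feature.properties['tweets']['Total'] / pop
        print '{:s}: {:.1f} tweets per 100k'.format(feature.id, rate)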
Add SB5 votes
In [20]:
reader = csv.DictReader(open('/Users/chbrown/corpora/ohio/ohio-secretaryofstate/issue2.tsv'),
                        dialect='excel-tab')
for row in reader:
    feature_id = row['County']
    feature = feature_collection[feature_id]
    feature.properties['votes'] = dict(
        For=int(row['Yes']), Against=int(row['No']),
        Total=int(row['Yes']) + int(row['No']))
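Since predicted tweet counts and official vote totals now both live on each feature, a rough county-level comparison is possible (a sketch; only counties with recorded votes and at least one geolocated tweet are shown):

# hypothetical comparison: predicted 'For' share in tweets vs. actual 'For' vote share
for feature in feature_collection.features:
    votes = feature.properties.get('votes')
    tweets = feature.properties['tweets']
    if votes and tweets['Total'] > 0:
        tweet_share = float(tweets['For']) / tweets['Total']
        vote_share = float(votes['For']) / votes['Total']
        print '{:s}: tweets For={:.1%}, votes For={:.1%}'.format(
            feature.id, tweet_share, vote_share)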
In [21]:
# geojson_path = os.path.join(root, 'static', 'oh-counties.geo.json')
geojson_encoder = geo.types.GeoEncoder(indent=None)
geojson_string = geojson_encoder.encode(feature_collection)
# with open(geojson_path, 'w') as geojson_fd:
#     geojson_fd.write(geojson_string)
topojson_path = os.path.join(root, 'static', 'oh-counties.topo.json')
from subprocess import PIPE
proc = subprocess.Popen(['topojson', '--properties', '--out', topojson_path],
                        stdin=PIPE, stdout=PIPE, stderr=PIPE)
# named proc_stdout/proc_stderr so as not to shadow the stdout/stderr imported from tsa
proc_stdout, proc_stderr = proc.communicate(geojson_string)
print 'Wrote feature_collection to GeoJSON and converted to TopoJSON'
print proc_stdout, proc_stderr
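communicate() does not raise on failure, so a minimal follow-up check is worthwhile (assuming the topojson CLI exits nonzero on error):

if proc.returncode != 0:
    logger.error('topojson failed with exit code %d', proc.returncode)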