In [1]:
import IPython
import os
import csv
import subprocess
import numpy as np
import pandas as pd
from tsa.science import numpy_ext as npx

import itertools
from collections import Counter, defaultdict

from sklearn import metrics, cross_validation
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm

from tsa import stdout, stderr, root
from tsa.lib import tabular, datetime_extra
from tsa.lib.timer import Timer
from tsa.models import Source, Document, create_session
from tsa.science import features, models, timeseries
from tsa.science.corpora import MulticlassCorpus
from tsa.science.plot import plt, figure_path, distinct_styles, ticker
from tsa.science.summarization import metrics_dict, metrics_summary

import geo
import geo.types
import geo.shapefile.reader
import geo.shapefile.writer
import geojson

from tsa import logging
logger = logging.getLogger(__name__)

head = lambda x: x[0]  # key function: first element of a tuple, used for sorting/grouping below

# lon = x = easting; lat = y = northing

In [2]:
session = create_session()
documents = session.query(Document).join(Source).\
    filter(Source.name == 'sb5b').all()

full_corpus = MulticlassCorpus(documents)
full_corpus.apply_labelfunc(lambda doc: doc.label or 'Unlabeled')
full_corpus.extract_features(lambda doc: doc.document, features.ngrams,
    ngram_max=2, min_df=2, max_df=1.0)

def doc_has_lon_lat(doc):
    return doc.details.get('Longitude') is not None and doc.details.get('Latitude') is not None

geolocated_indices = np.array([doc_has_lon_lat(doc) for doc in full_corpus.data])
labeled_geolocated_indices = np.array([doc_has_lon_lat(doc) and doc.label is not None
                                       for doc in full_corpus.data])

print 'Number of geo-located tweets: {:d} ({:.2%})'.format(
    geolocated_indices.sum(), geolocated_indices.mean())
print 'Number of labeled geo-located tweets: {:d} ({:.2%})'.format(
    labeled_geolocated_indices.sum(), labeled_geolocated_indices.mean())


Number of geo-located tweets: 3769 (3.53%)
Number of labeled geo-located tweets: 247 (0.23%)
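As a quick sanity check before modeling, a sketch (using the Counter imported above) of how the gold labels distribute within the geo-located subset:

# Sketch: gold-label distribution among geo-located tweets
geolocated_label_counts = Counter(doc.label or 'Unlabeled'
                                  for doc in full_corpus.data
                                  if doc_has_lon_lat(doc))
print geolocated_label_counts.most_common()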

In [3]:
polar_classes = [full_corpus.class_lookup[label] for label in ['For', 'Against']]
polar_indices = np.in1d(full_corpus.y, polar_classes)
labeled_corpus = full_corpus.subset(polar_indices)

logreg_model = linear_model.LogisticRegression(fit_intercept=True, penalty='l2')
logreg_model.fit(labeled_corpus.X, labeled_corpus.y)


Out[3]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)
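Since this model will label every geo-located tweet, a held-out accuracy estimate is worth a look first. A minimal sketch using the cross_validation module imported above; the 10-fold choice is arbitrary:

# Sketch: cross-validated accuracy on the For/Against subset
scores = cross_validation.cross_val_score(
    linear_model.LogisticRegression(fit_intercept=True, penalty='l2'),
    labeled_corpus.X, labeled_corpus.y, cv=10)
print 'Accuracy: {:.2%} (+/- {:.2%})'.format(scores.mean(), scores.std())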

In [4]:
geolocated_corpus = full_corpus.subset(geolocated_indices)
geolocated_pred_y = logreg_model.predict(geolocated_corpus.X)
# geolocated_pred_proba = logreg_model.predict_proba(geolocated_corpus.X)
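The commented-out predict_proba call hints at a confidence-aware variant. A hedged sketch: count how many predictions clear a winning-class probability cutoff (the 0.9 here is purely illustrative):

# Sketch: how many predictions are high-confidence?
geolocated_pred_proba = logreg_model.predict_proba(geolocated_corpus.X)
confident = geolocated_pred_proba.max(axis=1) > 0.9  # hypothetical cutoff
print 'Confident predictions: {} of {}'.format(confident.sum(), confident.size)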

In [5]:
# countyp020.shp has fields like: {'PERIMETER': 1.82, 'STATE_FIPS': '39',
#   'AREA': 0.199, 'COUNTY': 'Ashtabula County', 'STATE': 'OH', 'FIPS': '39007',
#   'COUNTYP020': 2539, 'SQUARE_MIL': 710.321}
counties_filepath = '/Users/chbrown/corpora-public/census-shapefiles/countyp020/countyp020.shp'
# counties_filepath = '/Users/chbrown/corpora-public/census-shapefiles/ESRI-UScounties/UScounties.shp'
shapefile_reader = geo.shapefile.reader.Reader(counties_filepath)
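Before parsing, it can help to confirm the schema matches the comment above. A sketch, assuming the field_names and records() accessors used in the next cell:

# Sketch: inspect the shapefile's schema and size
print shapefile_reader.field_names
print len(shapefile_reader.records()), 'records'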

In [6]:
def get_ohio_counties(shapefile_reader):
    for record, shape in zip(shapefile_reader.records(), shapefile_reader.shapes()):
        attributes = dict(zip(shapefile_reader.field_names, record))
        state = attributes['STATE']
        county = attributes['COUNTY'].replace('County', '').strip()
        if state == 'OH' and county:
            yield county, attributes, shape
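Counties with islands (Ottawa, for one) contribute several shapes to countyp020, so county names repeat; a sketch tallying the duplicates that motivate the groupby below:

# Sketch: how many shapes does each Ohio county contribute?
county_shape_counts = Counter(county for county, _, _ in get_ohio_counties(shapefile_reader))
print [item for item in county_shape_counts.most_common() if item[1] > 1]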

In [7]:
grouped = itertools.groupby(sorted(get_ohio_counties(shapefile_reader), key=head), head)

# use a distinct name so we don't shadow the `features` module imported above
county_features = []
for county, group in grouped:
    group = list(group)
    # counties with islands contribute multiple shapes; merge those into a MultiPolygon
    if len(group) == 1:
        geometry = group[0][2].__geo_interface__
    else:
        geometry = geojson.MultiPolygon([item[2].__geo_interface__['coordinates'] for item in group])
    properties = dict(county=county, FIPS=group[0][1]['FIPS'], area_sq_mi=group[0][1]['SQUARE_MIL'])
    #bbox = geo.types.BoundingBox(*group[0][2].bbox)
    bbox = None
    county_features += [geo.types.Feature(geometry, properties, id=county, bbox=bbox)]

feature_collection = geo.types.FeatureCollection(county_features)
out_of_state = geo.types.Feature(dict(type=None), dict(FIPS=None), id='Out-of-state')
feature_collection.features += [out_of_state]
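Ohio has 88 counties, so with the Out-of-state catch-all appended the collection should hold 89 features. A quick check:

# Sketch: expect 88 Ohio counties + 1 'Out-of-state' feature
print '{} features (88 counties + Out-of-state = 89 expected)'.format(len(feature_collection.features))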

In [8]:
counties = [
            ('Ottawa', feature_collection['Ottawa']),
            ('Franklin', feature_collection['Franklin']),
           ]
cities = [
          ('Austin', (-97.75, 30.25)),
          ('Columbus', (-82.98, 39.98)),
          ('Port Clinton', (-82.9433, 41.5063)), # in Ottawa County, which has islands
         ]

In [9]:
# for feature in feature_collection.features:
#     print feature.id, feature.bbox
# ottawa = feature_collection['Ottawa']
# ottawa.contains(-82.9433, 41.5063)

In [10]:
#parts = shape.parts.tolist() + [len(shape.points)]
#polygons = [shape.points[i:j] for i, j in zip(parts, parts[1:])]
print 'Testing county coverage...'
for county, feature in counties:
    for city, (lon, lat) in cities:
        print '{:s} contains {:s}? {:s}'.format(
            county, city, str(feature.contains(lon, lat)))


Testing county coverage...
Ottawa contains Austin? False
Ottawa contains Columbus? False
Ottawa contains Port Clinton? True
Franklin contains Austin? False
Franklin contains Columbus? True
Franklin contains Port Clinton? False

Add counts for geo-located tweets


In [13]:
def location(document, feature_collection):
    lon = document.details.get('Longitude')
    lat = document.details.get('Latitude')
    # a point inside Ohio should fall inside exactly one county polygon
    matches = list(feature_collection.features_containing(lon, lat))
    return matches[0].id if len(matches) == 1 else 'Out-of-state'

for feature in feature_collection.features:
    feature.properties['tweets'] = dict(For=0, Against=0, Total=0)
    feature.properties['labeled_tweets'] = dict(For=0, Against=0, Total=0)

geolocated_pred_labels = geolocated_corpus.labels[geolocated_pred_y]
for pred_label, document in zip(geolocated_pred_labels, geolocated_corpus.data):
    feature_id = location(document, feature_collection)
    feature = feature_collection[feature_id]
    feature.properties['tweets'][pred_label] += 1
    feature.properties['tweets']['Total'] += 1
    # for tweets that also carry a gold label, tally the prediction separately
    if document.label in ['For', 'Against']:
        feature.properties['labeled_tweets'][pred_label] += 1
        feature.properties['labeled_tweets']['Total'] += 1
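A spot-check of the tallies; this sketch lists the five counties with the most geo-located tweets:

# Sketch: counties with the most predicted tweets
by_total = sorted(feature_collection.features,
                  key=lambda feature: feature.properties['tweets']['Total'], reverse=True)
for feature in by_total[:5]:
    print feature.id, feature.properties['tweets']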

Add population data


In [12]:
fips2county = dict((feature.properties['FIPS'], feature.id) for feature in feature_collection.features)
reader = csv.DictReader(open('/Users/chbrown/corpora/ohio/census/DataSet.txt'))
for row in reader:
    fips = row['fips']
    # POP010210 = Population, 2010
    population = row['POP010210']
    ohio_county = fips2county.get(fips)
    if ohio_county:
        county_feature = feature_collection[ohio_county]
        county_feature.properties['population'] = int(population)

def feature_population(feature):
    return feature.properties.get('population', 0)
# for feature in sorted(feature_collection.features, key=feature_population):
#     print feature.id, feature_population(feature)

total_pop = sum(feature_population(feature) for feature in feature_collection.features)
print total_pop, 'vs. Wikipedia\'s "2010 = 11536504"'


11536504 vs. Wikipedia's "2010 = 11536504"
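With populations attached, a per-capita rate becomes possible. A sketch that stores tweets per 100,000 residents, skipping the Out-of-state feature, which has no population; tweets_per_100k is a hypothetical property name:

# Sketch: normalize tweet counts by county population
for feature in feature_collection.features:
    population = feature.properties.get('population')
    if population:
        feature.properties['tweets_per_100k'] = \
            100000.0 * feature.properties['tweets']['Total'] / population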

Add SB 5 (Issue 2) vote totals


In [20]:
reader = csv.DictReader(open('/Users/chbrown/corpora/ohio/ohio-secretaryofstate/issue2.tsv'),
                        dialect='excel-tab')
for row in reader:
    feature_id = row['County']
    feature = feature_collection[feature_id]
    yes_votes, no_votes = int(row['Yes']), int(row['No'])
    feature.properties['votes'] = dict(For=yes_votes, Against=no_votes,
                                       Total=yes_votes + no_votes)
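With both predicted tweet tallies and official returns on each county, a natural check is whether the predicted 'For' share tracks the actual vote share. A sketch using np.corrcoef; counties with no predicted tweets are skipped to avoid dividing by zero:

# Sketch: correlate predicted For-share with actual For-share of the vote
pred_shares, vote_shares = [], []
for feature in feature_collection.features:
    tweets = feature.properties['tweets']
    votes = feature.properties.get('votes')
    if votes and tweets['Total'] > 0:
        pred_shares += [float(tweets['For']) / tweets['Total']]
        vote_shares += [float(votes['For']) / votes['Total']]
print 'Pearson r: {:.3f}'.format(np.corrcoef(pred_shares, vote_shares)[0, 1])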

In [21]:
# geojson_path = os.path.join(root, 'static', 'oh-counties.geo.json')
geojson_encoder = geo.types.GeoEncoder(indent=None)
geojson_string = geojson_encoder.encode(feature_collection)
# with open(geojson_path, 'w') as geojson_fd:
#     geojson_fd.write(geojson_string)
topojson_path = os.path.join(root, 'static', 'oh-counties.topo.json')
proc = subprocess.Popen(['topojson', '--properties', '--out', topojson_path],
                        stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# use distinct names to avoid shadowing the stdout/stderr imported from tsa above
proc_stdout, proc_stderr = proc.communicate(geojson_string)
print 'Wrote feature_collection to GeoJSON and converted to TopoJSON'
print proc_stdout, proc_stderr


Wrote feature_collection to GeoJSON and converted to TopoJSON
 bounds: -84.81993103027344 38.40250015258789 -80.51771545410156 41.978248596191406 (spherical)
pre-quantization: 0.478m (0.00000430°) 0.398m (0.00000358°)
topology: 262 arcs, 3840 points
post-quantization: 47.8m (0.000430°) 39.8m (0.000358°)
prune: retained 262 / 262 arcs (100%)
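Reading the file back confirms the conversion produced parseable TopoJSON. A sketch; 'objects' is the standard top-level key of the TopoJSON format:

# Sketch: confirm the TopoJSON output parses and lists its objects
import json
with open(topojson_path) as topojson_fd:
    topology = json.load(topojson_fd)
print topology['objects'].keys()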

