In [1]:
import bson
import IPython.core.display
import IPython.display
import matplotlib
import monary
import numpy
import pandas
import pprint
import pymongo
import sklearn.linear_model
# Matplotlib on Mac OS X in a virtualenv doesn't work properly, so we have to force it to use the TkAgg backend.
matplotlib.use('TkAgg')
import matplotlib.pyplot
%matplotlib inline
# Name of the MongoDB database that both the pymongo client and the Monary query read from.
DB_NAME = 'radio'
In [2]:
# Connect to the MongoDB server on this machine (default port) and pick the
# database named by DB_NAME.
client = pymongo.MongoClient(host='localhost', port=27017)
db = client[DB_NAME]
In [3]:
# Trying out the subjects DB (following the setup.md example).
subjects = db['radio_subjects']
subject = subjects.find_one()
# find_one() returns None when the collection is empty; fail here with a clear
# message rather than with a TypeError on the subscript further down.
assert subject is not None, "radio_subjects is empty; has the database been imported?"
pprint.pprint(subject)
# Last expression of the cell, so the notebook displays the image found at the
# subject's 'radio' location URL.
IPython.display.Image(url=subject['location']['radio'])
Out[3]:
In [4]:
# Display the image stored under the same subject's 'standard' location URL.
standard_url = subject['location']['standard']
IPython.display.Image(url=standard_url)
Out[4]:
In [5]:
# Trying out the classifications DB.
classifications = db['radio_classifications']
for classification in classifications.find():
    # Stop at the first classification whose first annotation has an 'ir' entry.
    annotations = classification['annotations'][0]
    if 'ir' in annotations:
        break
else:
    # Without this branch an exhausted cursor would leave `classification`
    # bound to the last (non-matching) document, or unbound if the collection
    # is empty, and the lookups below would fail with a confusing error.
    raise LookupError("no classification with an 'ir' annotation was found")

# `annotations` is the first annotation of the classification selected above.
print(annotations['ir']['0'])
print(annotations['radio']['0'])
# First subject ID referenced by the selected classification.
sid = classification['subject_ids'][0]
pprint.pprint(classification)
In [6]:
# Trying out Monary: query two fields of every document in radio_subjects.
columns = ['classification_count', 'state']
column_types = ['uint8', 'string:20']
with monary.Monary() as mon:
    # The empty dict is the query filter, i.e. no restriction on documents.
    subjects_data = mon.query(DB_NAME, 'radio_subjects', {}, columns, column_types)

# Fill any masked entries and transpose before building the frame.
# NOTE(review): presumably the query yields one array per requested column, so
# the transpose gives one row per subject -- confirm against the Monary docs.
subjects_matrix = numpy.ma.filled(subjects_data).T
subjects_dataframe = pandas.DataFrame(subjects_matrix, columns=columns)
subjects_dataframe.head()
# Open questions:
# - How can an entire nested BSON document (e.g. the whole `location`
#   property) be loaded?
# - How can documents be queried by ID? Properties can be specified in
#   find_one and in the Monary query, but this doesn't seem to work for ID
#   (possibly due to its type?).
Out[6]:
To demonstrate basic logistic regression, I will map classification count to whether or not subjects are complete.
In [7]:
# Get the data into types we can use: a boolean `complete` flag derived from
# the byte-string state, and the classification count cast to int.
subjects_dataframe = subjects_dataframe.assign(
    complete=subjects_dataframe['state'].eq(b'complete'),
    classification_count=subjects_dataframe['classification_count'].astype(int),
)
subjects_dataframe.head()
Out[7]:
In [8]:
# Let's look at the data to see how separable it is.
subjects_dataframe = subjects_dataframe.sort_values(by='classification_count')
# Use at most 150000 rows. The cap matters: with fewer rows than that, the
# sliced column would be shorter than numpy.ones(samples) and numpy.vstack
# would raise a shape-mismatch ValueError.
samples = min(150000, len(subjects_dataframe))
# One row per subject: [classification count, constant 1].
features = numpy.vstack([subjects_dataframe['classification_count'].iloc[:samples],
                         numpy.ones(samples)]).T
# 0.0 / 1.0 targets taken from the same leading rows as the features.
targets = subjects_dataframe['complete'].iloc[:samples].astype(float)
matplotlib.pyplot.scatter(features[:, 0], targets)
matplotlib.pyplot.title('Classification Count vs Complete')
matplotlib.pyplot.xlabel('Classification Count')
matplotlib.pyplot.ylabel('Complete?')
matplotlib.pyplot.show()
In [9]:
# Not very linearly separable/separable at all. Let's try separating it anyway.
lr = sklearn.linear_model.LogisticRegression()
lr.fit(features, targets)
predictions = lr.predict(features)

# Split the samples by which side of 0.5 the prediction falls on, building
# each boolean mask once.
predicted_incomplete = predictions < 0.5
predicted_complete = predictions > 0.5
incomplete_features = features[predicted_incomplete]
complete_features = features[predicted_complete]
# Despite the names, these hold the *true* targets of each predicted group,
# which is what the y-axis of the plot shows.
incomplete_predictions = targets[predicted_incomplete]
complete_predictions = targets[predicted_complete]

# Draw the predicted-incomplete group first so the legend order below matches.
for group_features, group_targets, colour, marker in (
        (incomplete_features, incomplete_predictions, 'r', '+'),
        (complete_features, complete_predictions, 'g', 'x')):
    matplotlib.pyplot.scatter(group_features[:, 0], group_targets, c=colour, marker=marker)
matplotlib.pyplot.title('Predicted Completeness from Classification Count')
matplotlib.pyplot.xlabel('Classification Count')
matplotlib.pyplot.ylabel('(True) Complete?')
matplotlib.pyplot.legend(['Incomplete', 'Complete'])
matplotlib.pyplot.show()
That seems a sensible result.
In [ ]: