In [1]:
from sqlalchemy import create_engine, MetaData, Table, inspect
from sqlalchemy.sql import select
from sqlalchemy.orm import sessionmaker
from matplotlib import pyplot as plt
from scipy.stats import mode
import pandas as pd
from unidecode import unidecode
from lib.alchemy.schema import GrantBase, ApplicationBase
from lib.alchemy import session_generator
sessiongen = session_generator(dbtype='grant')
session = sessiongen()
In [10]:
def printstats(series):
print 'mean',series.mean()
print 'median',series.median()
print 'mode',mode(series)[0][0]
print 'std',series.std()
print 'min',series.min()
print 'max',series.max()
In [12]:
counts =[]
tablekeys = []
tables = GrantBase.metadata.tables
rawtables = tables.keys()
for table in rawtables:
res = session.execute('select count(*) from {0}'.format(table)).fetchone()[0]
if res:
counts.append(res)
tablekeys.append(table)
d = pd.DataFrame.from_dict({'tables': tablekeys, 'counts': map(lambda x: int(x), counts)})
d.index = d['tables']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Table')
h.set_ylabel('Record Count')
Out[12]:
In [4]:
patent_count = session.execute('select count(*) from patent;').fetchone()[0]
app_count = session.execute('select count(*) from application;').fetchone()[0]
rawinventor_count = session.execute('select count(*) from rawinventor;').fetchone()[0]
disambiginventor_count = session.execute('select count(*) from inventor;').fetchone()[0]
rawassignee_count = session.execute('select count(*) from rawassignee;').fetchone()[0]
disambigassignee_count = session.execute('select count(*) from assignee;').fetchone()[0]
rawlawyer_count = session.execute('select count(*) from rawlawyer;').fetchone()[0]
disambiglawyer_count = session.execute('select count(*) from lawyer;').fetchone()[0]
rawlocation_count = session.execute('select count(*) from rawlocation;').fetchone()[0]
disambiglocation_count = session.execute('select count(*) from location;').fetchone()[0]
d = pd.DataFrame.from_dict({'raw': [patent_count,'',rawinventor_count,rawassignee_count,rawlawyer_count,rawlocation_count],
'disambig': ['', app_count, disambiginventor_count, disambigassignee_count, disambiglawyer_count, disambiglocation_count],
'labels': ['patent','application','inventor','assignee','lawyer','location']})
d[['labels','raw','disambig']]
Out[4]:
In [13]:
# inventors per patent
res = session.execute('select count(*) from rawinventor group by patent_id;')
inventor_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': inventor_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Number of Inventors')
h.set_ylabel('Patent Count')
h.set_title('Inventors per Patent')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]
In [14]:
# patents per inventor
session = sessiongen()
res = session.execute('select count(*) from patent_inventor group by inventor_id;')
patent_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': patent_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Patents')
h.set_ylabel('Inventors')
h.set_title('Patents per Inventor')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]
In [15]:
res = session.execute('select location.state, count(*) from rawinventor \
left join rawlocation on rawinventor.rawlocation_id = rawlocation.id \
left join location on location.id = rawlocation.location_id \
where location.country = "US" and length(location.state) = 2 group by location.state ')
data = res.fetchall()
inventor_counts = [int(x[1]) for x in data]
inventor_states = [unidecode(x[0]) for x in data]
d = pd.DataFrame.from_dict({'counts': inventor_counts, 'states': inventor_states})
d.index = d['states']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. States')
h.set_ylabel('Raw Inventors')
h.set_title('Raw Inventors per Reported State')
printstats(d['counts'])
print len(d['states']), 'identified states'
In [16]:
# cities by state
res = session.execute('select rawlocation.state, count(distinct rawlocation.city) from rawlocation \
where rawlocation.country = "US" and length(rawlocation.state) = 2 \
group by rawlocation.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (raw)')
print "Raw Locations"
printstats(d['count'])
print sum(d['count']), 'raw total cities'
# disambiguated
res = session.execute('select location.state, count(distinct location.city) from location \
where location.country = "US" and length(location.state) = 2 \
group by location.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d[['count','state']].plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (disambig)')
print "Disambiguated Locations:"
printstats(d['count'])
print sum(d['count']), 'total cities'
In [17]:
# patents per state
res = session.execute("select location.state, count(*) from patent \
left join rawinventor on rawinventor.patent_id = patent.id \
left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
right join location on location.id = rawlocation.location_id \
where length(location.state) = 2 and rawinventor.sequence = 0 \
and location.country = 'US' \
group by location.state")
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Patents')
h.set_title('Patents per State')
printstats(d['count'])
print sum(d['count'])
In [18]:
res = session.execute('select year(date), count(*) from patent group by year(date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Grant Year')
h.set_ylabel('Patent Count')
h.set_title('Granted Patents by Grant year')
printstats(d['counts'])
print sum(d['counts']), 'total patents'
In [19]:
res = session.execute('select year(application.date), count(*) from patent \
left join application on application.patent_id = patent.id \
where year(application.date) != "" \
group by year(application.date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('App Year')
h.set_ylabel('Patent Count')
h.set_title('Granted Patents by Application year')
printstats(d['counts'])
print sum(d['counts']), 'total granted applications'
In [20]:
# patents per assignee
res = session.execute('select count(*) from rawassignee group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawAssignees')
h.set_ylabel('Patents')
h.set_title('Patents per RawAssignee')
Out[20]:
In [21]:
# patents per lawyer
res = session.execute('select count(*) from rawlawyer group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawLawyers')
h.set_ylabel('Patents')
h.set_title('Patents per RawLawyer')
Out[21]:
In [22]:
res = session.execute('select count(*) from uspatentcitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
res = session.execute('select count(*) from uspatentcitation group by citation_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Backward Citation Stats"
printstats(d['counts'])
In [23]:
res = session.execute('select count(*) from foreigncitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
res = session.execute('select count(*) from foreigncitation group by number;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Backward Citation Stats"
printstats(d['counts'])
In [24]:
res = session.execute('select count(*) from otherreference group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
In [25]:
res = session.execute('select count(*) from usapplicationcitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
In [2]:
sessiongen = session_generator(dbtype='application')
session = sessiongen()
In [3]:
counts =[]
tablekeys = []
tables = ApplicationBase.metadata.tables
rawtables = tables.keys()
for table in rawtables:
res = session.execute('select count(*) from {0}'.format(table)).fetchone()[0]
if res:
counts.append(res)
tablekeys.append(table)
d = pd.DataFrame.from_dict({'tables': tablekeys, 'counts': map(lambda x: int(x), counts)})
d.index = d['tables']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Table')
h.set_ylabel('Record Count')
Out[3]:
In [8]:
patent_count = session.execute('select count(*) from application;').fetchone()[0]
rawinventor_count = session.execute('select count(*) from rawinventor;').fetchone()[0]
disambiginventor_count = session.execute('select count(*) from inventor;').fetchone()[0]
rawassignee_count = session.execute('select count(*) from rawassignee;').fetchone()[0]
disambigassignee_count = session.execute('select count(*) from assignee;').fetchone()[0]
rawlocation_count = session.execute('select count(*) from rawlocation;').fetchone()[0]
disambiglocation_count = session.execute('select count(*) from location;').fetchone()[0]
d = pd.DataFrame.from_dict({'raw': [patent_count,rawinventor_count,rawassignee_count,rawlocation_count],
'disambig': ['N/A',disambiginventor_count, disambigassignee_count, disambiglocation_count],
'labels': ['application','inventor','assignee','location']})
d[['labels','raw','disambig']]
Out[8]:
In [11]:
# inventors per patent
res = session.execute('select count(*) from rawinventor group by application_id;')
inventor_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': inventor_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Number of Inventors')
h.set_ylabel('Application Count')
h.set_title('Inventors per Application')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]
In [12]:
# patents per inventor
session = sessiongen()
res = session.execute('select count(*) from application_inventor group by inventor_id;')
patent_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': patent_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Applications')
h.set_ylabel('Inventors')
h.set_title('Applications per Inventor')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]
In [13]:
res = session.execute('select location.state, count(*) from rawinventor \
left join rawlocation on rawinventor.rawlocation_id = rawlocation.id \
left join location on location.id = rawlocation.location_id \
where location.country = "US" and length(location.state) = 2 group by location.state ')
data = res.fetchall()
inventor_counts = [int(x[1]) for x in data]
inventor_states = [unidecode(x[0]) for x in data]
d = pd.DataFrame.from_dict({'counts': inventor_counts, 'states': inventor_states})
d.index = d['states']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. States')
h.set_ylabel('Raw Inventors')
h.set_title('Raw Inventors per Reported State')
printstats(d['counts'])
print len(d['states']), 'identified states'
In [14]:
# cities by state
res = session.execute('select rawlocation.state, count(distinct rawlocation.city) from rawlocation \
where rawlocation.country = "US" and length(rawlocation.state) = 2 \
group by rawlocation.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (raw)')
print "Raw Locations"
printstats(d['count'])
print sum(d['count']), 'raw total cities'
# disambiguated
res = session.execute('select location.state, count(distinct location.city) from location \
where location.country = "US" and length(location.state) = 2 \
group by location.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d[['count','state']].plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (disambig)')
print "Disambiguated Locations:"
printstats(d['count'])
print sum(d['count']), 'total cities'
In [15]:
# applications per state
res = session.execute("select location.state, count(*) from application \
left join rawinventor on rawinventor.application_id = application.id \
left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
right join location on location.id = rawlocation.location_id \
where length(location.state) = 2 and rawinventor.sequence = 0 \
and location.country = 'US' \
group by location.state")
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Applications')
h.set_title('Applications per State')
printstats(d['count'])
print sum(d['count'])
In [16]:
res = session.execute('select year(date), count(*) from application group by year(date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('App Year')
h.set_ylabel('Application Count')
h.set_title('Applications by year')
printstats(d['counts'])
print sum(d['counts']), 'total patents'
In [17]:
# patents per assignee
res = session.execute('select count(*) from rawassignee group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawAssignees')
h.set_ylabel('Applications')
h.set_title('Applications per RawAssignee')
Out[17]:
In [ ]: