In [1]:
from sqlalchemy import create_engine, MetaData, Table, inspect
from sqlalchemy.sql import select
from sqlalchemy.orm import sessionmaker
from matplotlib import pyplot as plt
from scipy.stats import mode
import pandas as pd
from unidecode import unidecode
from lib.alchemy.schema import GrantBase, ApplicationBase
from lib.alchemy import session_generator
sessiongen = session_generator(dbtype='grant')
session = sessiongen()

In [10]:
def printstats(series):
    print 'mean',series.mean()
    print 'median',series.median()
    print 'mode',mode(series)[0][0]
    print 'std',series.std()
    print 'min',series.min()
    print 'max',series.max()

Table Counts


In [12]:
counts =[]
tablekeys = []
tables = GrantBase.metadata.tables
rawtables = tables.keys()
for table in rawtables:
    res = session.execute('select count(*) from {0}'.format(table)).fetchone()[0]
    if res:
        counts.append(res)
        tablekeys.append(table)
d = pd.DataFrame.from_dict({'tables': tablekeys, 'counts': map(lambda x: int(x), counts)})
d.index = d['tables']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Table')
h.set_ylabel('Record Count')


Out[12]:
<matplotlib.text.Text at 0x10b503610>

In [4]:
patent_count = session.execute('select count(*) from patent;').fetchone()[0]
app_count = session.execute('select count(*) from application;').fetchone()[0]

rawinventor_count = session.execute('select count(*) from rawinventor;').fetchone()[0]
disambiginventor_count = session.execute('select count(*) from inventor;').fetchone()[0]

rawassignee_count = session.execute('select count(*) from rawassignee;').fetchone()[0]
disambigassignee_count = session.execute('select count(*) from assignee;').fetchone()[0]

rawlawyer_count = session.execute('select count(*) from rawlawyer;').fetchone()[0]
disambiglawyer_count = session.execute('select count(*) from lawyer;').fetchone()[0]

rawlocation_count = session.execute('select count(*) from rawlocation;').fetchone()[0]
disambiglocation_count = session.execute('select count(*) from location;').fetchone()[0]

d = pd.DataFrame.from_dict({'raw': [patent_count,'',rawinventor_count,rawassignee_count,rawlawyer_count,rawlocation_count],
                           'disambig': ['', app_count, disambiginventor_count, disambigassignee_count, disambiglawyer_count, disambiglocation_count],
                           'labels': ['patent','application','inventor','assignee','lawyer','location']})
d[['labels','raw','disambig']]


Out[4]:
labels raw disambig
0 patent 5072245
1 application 5002322
2 inventor 11546565 3780422
3 assignee 6738097 368684
4 lawyer 3648399 121268
5 location 403853 69640

Inventor

Raw Inventor


In [13]:
# inventors per patent
res = session.execute('select count(*) from rawinventor group by patent_id;')
inventor_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': inventor_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Number of Inventors')
h.set_ylabel('Patent Count')
h.set_title('Inventors per Patent')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]


mean 2.27642144187
median 2.0
mode 1.0
std 1.67215348567
min 1
max 76
Total: 11546565

In [14]:
# patents per inventor
session = sessiongen()
res = session.execute('select count(*) from patent_inventor group by inventor_id;')
patent_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': patent_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Patents')
h.set_ylabel('Inventors')
h.set_title('Patents per Inventor')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]


mean 3.01963405144
median 1.0
mode 1.0
std 7.50657935128
min 1
max 4582
Total: 11546565

Location

RawLocation


In [15]:
res = session.execute('select location.state, count(*) from rawinventor \
                       left join rawlocation on rawinventor.rawlocation_id = rawlocation.id \
                       left join location on location.id = rawlocation.location_id \
                       where location.country = "US" and length(location.state) = 2 group by location.state ')
data = res.fetchall()
inventor_counts = [int(x[1]) for x in data]
inventor_states = [unidecode(x[0]) for x in data]
d = pd.DataFrame.from_dict({'counts': inventor_counts, 'states': inventor_states})
d.index = d['states']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. States')
h.set_ylabel('Raw Inventors')
h.set_title('Raw Inventors per Reported State')
printstats(d['counts'])
print len(d['states']), 'identified states'


mean 106914.854545
median 41123.0
mode 7.0
std 187392.226124
min 7
max 1244222
55 identified states

Location


In [16]:
# cities by state
res = session.execute('select rawlocation.state, count(distinct rawlocation.city) from rawlocation \
                       where rawlocation.country = "US" and length(rawlocation.state) = 2 \
                       group by rawlocation.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (raw)')
print "Raw Locations"
printstats(d['count'])
print sum(d['count']), 'raw total cities'
# disambiguated
res = session.execute('select location.state, count(distinct location.city) from location \
                       where location.country = "US" and length(location.state) = 2 \
                       group by location.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d[['count','state']].plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (disambig)')
print "Disambiguated Locations:"
printstats(d['count'])
print sum(d['count']), 'total cities'


Raw Locations
mean 983.229508197
median 621.0
mode 1.0
std 1165.26165007
min 1
max 5809
59977 raw total cities
Disambiguated Locations:
mean 284.018181818
median 195.0
mode 1.0
std 263.929809055
min 1
max 1073
15621 total cities

Patents per State


In [17]:
# patents per state
res = session.execute("select location.state, count(*) from patent \
                       left join rawinventor on rawinventor.patent_id = patent.id \
                       left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
                       right join location on location.id = rawlocation.location_id \
                       where length(location.state) = 2 and rawinventor.sequence = 0 \
                       and location.country = 'US' \
                       group by location.state")
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Patents')
h.set_title('Patents per State')
printstats(d['count'])
print sum(d['count'])


mean 18535.2222222
median 7598.0
mode 1.0
std 36240.9846594
min 1
max 252941
1000902

Patents


In [18]:
res = session.execute('select year(date), count(*) from patent group by year(date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Grant Year')
h.set_ylabel('Patent Count')
h.set_title('Granted Patents by Grant year')
printstats(d['counts'])
print sum(d['counts']), 'total patents'


mean 130057.564103
median 101419.0
mode 48854.0
std 67968.1111903
min 48854
max 303478
5072245 total patents

Granted Applications


In [19]:
res = session.execute('select year(application.date), count(*) from patent \
                       left join application on application.patent_id = patent.id \
                       where year(application.date) != "" \
                       group by year(application.date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('App Year')
h.set_ylabel('Patent Count')
h.set_title('Granted Patents by Application year')
printstats(d['counts'])
print sum(d['counts']), 'total granted applications'


mean 54373.0326087
median 283.5
mode 1.0
std 74219.9389947
min 1
max 228149
5002319 total granted applications

Raw Assignees


In [20]:
# patents per assignee
res = session.execute('select count(*) from rawassignee group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawAssignees')
h.set_ylabel('Patents')
h.set_title('Patents per RawAssignee')


mean 16.016432176
median 2.0
mode 1.0
std 1077.23710111
min 1
max 661694
Out[20]:
<matplotlib.text.Text at 0x1642918d0>

Raw Lawyers


In [21]:
# patents per lawyer
res = session.execute('select count(*) from rawlawyer group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawLawyers')
h.set_ylabel('Patents')
h.set_title('Patents per RawLawyer')


mean 70.0483641809
median 1.0
mode 1.0
std 5569.59634035
min 1
max 1266613
Out[21]:
<matplotlib.text.Text at 0x14f9e8a50>

Citations

US Patent Citation


In [22]:
res = session.execute('select count(*) from uspatentcitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
res = session.execute('select count(*) from uspatentcitation group by citation_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Backward Citation Stats"
printstats(d['counts'])


Forward Citation Stats
mean 12.7942816581
median 7.0
mode 4.0
std 31.1585783602
min 1
max 3036
Backward Citation Stats
mean 10.5262394974
median 4.0
mode 1.0
std 22.0138044268
min 1
max 2910

Foreign Citation


In [23]:
res = session.execute('select count(*) from foreigncitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
res = session.execute('select count(*) from foreigncitation group by number;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Backward Citation Stats"
printstats(d['counts'])


Forward Citation Stats
mean 7.96402179982
median 4.0
mode 1.0
std 17.5793650342
min 1
max 954
Backward Citation Stats
mean 2.38540606228
median 1.0
mode 1.0
std 5.55979417389
min 1
max 814

Other References


In [24]:
res = session.execute('select count(*) from otherreference group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])


Forward Citation Stats
mean 13.6768728825
median 3.0
mode 1.0
std 43.0066034322
min 1
max 2964

Application Citation


In [25]:
res = session.execute('select count(*) from usapplicationcitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])


Forward Citation Stats
mean 8.10105988436
median 3.0
mode 1.0
std 22.1858136802
min 1
max 1168

Application Data


In [2]:
sessiongen = session_generator(dbtype='application')
session = sessiongen()

Table Counts


In [3]:
counts =[]
tablekeys = []
tables = ApplicationBase.metadata.tables
rawtables = tables.keys()
for table in rawtables:
    res = session.execute('select count(*) from {0}'.format(table)).fetchone()[0]
    if res:
        counts.append(res)
        tablekeys.append(table)
d = pd.DataFrame.from_dict({'tables': tablekeys, 'counts': map(lambda x: int(x), counts)})
d.index = d['tables']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Table')
h.set_ylabel('Record Count')


Out[3]:
<matplotlib.text.Text at 0x114452110>

In [8]:
patent_count = session.execute('select count(*) from application;').fetchone()[0]

rawinventor_count = session.execute('select count(*) from rawinventor;').fetchone()[0]
disambiginventor_count = session.execute('select count(*) from inventor;').fetchone()[0]

rawassignee_count = session.execute('select count(*) from rawassignee;').fetchone()[0]
disambigassignee_count = session.execute('select count(*) from assignee;').fetchone()[0]

rawlocation_count = session.execute('select count(*) from rawlocation;').fetchone()[0]
disambiglocation_count = session.execute('select count(*) from location;').fetchone()[0]

d = pd.DataFrame.from_dict({'raw': [patent_count,rawinventor_count,rawassignee_count,rawlocation_count],
                           'disambig': ['N/A',disambiginventor_count, disambigassignee_count, disambiglocation_count],
                           'labels': ['application','inventor','assignee','location']})
d[['labels','raw','disambig']]


Out[8]:
labels raw disambig
0 application 3422423 N/A
1 inventor 9009611 4392105
2 assignee 1925136 368685
3 location 222423 121331

Inventor


In [11]:
# inventors per patent
res = session.execute('select count(*) from rawinventor group by application_id;')
inventor_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': inventor_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Number of Inventors')
h.set_ylabel('Application Count')
h.set_title('Inventors per Application')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]


mean 2.63268640283
median 2.0
mode 1.0
std 1.93344233411
min 1
max 100
Total: 9009611

In [12]:
# patents per inventor
session = sessiongen()
res = session.execute('select count(*) from application_inventor group by inventor_id;')
patent_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': patent_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Applications')
h.set_ylabel('Inventors')
h.set_title('Applications per Inventor')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]


mean 1.01828553734
median 1.0
mode 1.0
std 0.822763437723
min 1
max 990
Total: 9009611

Location

RawLocation


In [13]:
res = session.execute('select location.state, count(*) from rawinventor \
                       left join rawlocation on rawinventor.rawlocation_id = rawlocation.id \
                       left join location on location.id = rawlocation.location_id \
                       where location.country = "US" and length(location.state) = 2 group by location.state ')
data = res.fetchall()
inventor_counts = [int(x[1]) for x in data]
inventor_states = [unidecode(x[0]) for x in data]
d = pd.DataFrame.from_dict({'counts': inventor_counts, 'states': inventor_states})
d.index = d['states']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. States')
h.set_ylabel('Raw Inventors')
h.set_title('Raw Inventors per Reported State')
printstats(d['counts'])
print len(d['states']), 'identified states'


mean 72594.862069
median 24226.5
mode 15.0
std 150918.82239
min 1
max 1074626
58 identified states

Disambiguated Location


In [14]:
# cities by state
res = session.execute('select rawlocation.state, count(distinct rawlocation.city) from rawlocation \
                       where rawlocation.country = "US" and length(rawlocation.state) = 2 \
                       group by rawlocation.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (raw)')
print "Raw Locations"
printstats(d['count'])
print sum(d['count']), 'raw total cities'
# disambiguated
res = session.execute('select location.state, count(distinct location.city) from location \
                       where location.country = "US" and length(location.state) = 2 \
                       group by location.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d[['count','state']].plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (disambig)')
print "Disambiguated Locations:"
printstats(d['count'])
print sum(d['count']), 'total cities'


Raw Locations
mean 390.46875
median 5.0
mode 1.0
std 813.071051292
min 1
max 5636
49980 raw total cities
Disambiguated Locations:
mean 424.93220339
median 308.0
mode 2.0
std 412.873649555
min 1
max 1724
25071 total cities

In [15]:
# applications per state
res = session.execute("select location.state, count(*) from application \
                       left join rawinventor on rawinventor.application_id = application.id \
                       left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
                       right join location on location.id = rawlocation.location_id \
                       where length(location.state) = 2 and rawinventor.sequence = 0 \
                       and location.country = 'US' \
                       group by location.state")
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Applications')
h.set_title('Applications per State')
printstats(d['count'])
print sum(d['count'])


mean 28035.3508772
median 10315.0
mode 1.0
std 56035.2411388
min 1
max 396320
1598015

Applications


In [16]:
res = session.execute('select year(date), count(*) from application group by year(date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('App Year')
h.set_ylabel('Application Count')
h.set_title('Applications by year')
printstats(d['counts'])
print sum(d['counts']), 'total patents'


mean 171121.15
median 159806.5
mode 1.0
std 157077.904978
min 1
max 404099
3422423 total patents

Assignees


In [17]:
# patents per assignee
res = session.execute('select count(*) from rawassignee group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawAssignees')
h.set_ylabel('Applications')
h.set_title('Applications per RawAssignee')


mean 8.78443826915
median 1.0
mode 1.0
std 216.518254708
min 1
max 51561
Out[17]:
<matplotlib.text.Text at 0x13e535210>

In [ ]: