In [1]:

    
from sqlalchemy import create_engine, MetaData, Table, inspect
from sqlalchemy.sql import select
from sqlalchemy.orm import sessionmaker
from matplotlib import pyplot as plt
from scipy.stats import mode
import pandas as pd
from unidecode import unidecode
from lib.alchemy.schema import GrantBase, ApplicationBase
from lib.alchemy import session_generator
sessiongen = session_generator(dbtype='grant')
session = sessiongen()



In [10]:

    
def printstats(series):
    print 'mean',series.mean()
    print 'median',series.median()
    print 'mode',mode(series)[0][0]
    print 'std',series.std()
    print 'min',series.min()
    print 'max',series.max()

Table Counts



In [12]:

    
counts =[]
tablekeys = []
tables = GrantBase.metadata.tables
rawtables = tables.keys()
for table in rawtables:
    res = session.execute('select count(*) from {0}'.format(table)).fetchone()[0]
    if res:
        counts.append(res)
        tablekeys.append(table)
d = pd.DataFrame.from_dict({'tables': tablekeys, 'counts': map(lambda x: int(x), counts)})
d.index = d['tables']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Table')
h.set_ylabel('Record Count')









    Out[12]:





<matplotlib.text.Text at 0x10b503610>



In [4]:

    
patent_count = session.execute('select count(*) from patent;').fetchone()[0]
app_count = session.execute('select count(*) from application;').fetchone()[0]

rawinventor_count = session.execute('select count(*) from rawinventor;').fetchone()[0]
disambiginventor_count = session.execute('select count(*) from inventor;').fetchone()[0]

rawassignee_count = session.execute('select count(*) from rawassignee;').fetchone()[0]
disambigassignee_count = session.execute('select count(*) from assignee;').fetchone()[0]

rawlawyer_count = session.execute('select count(*) from rawlawyer;').fetchone()[0]
disambiglawyer_count = session.execute('select count(*) from lawyer;').fetchone()[0]

rawlocation_count = session.execute('select count(*) from rawlocation;').fetchone()[0]
disambiglocation_count = session.execute('select count(*) from location;').fetchone()[0]

d = pd.DataFrame.from_dict({'raw': [patent_count,'',rawinventor_count,rawassignee_count,rawlawyer_count,rawlocation_count],
                           'disambig': ['', app_count, disambiginventor_count, disambigassignee_count, disambiglawyer_count, disambiglocation_count],
                           'labels': ['patent','application','inventor','assignee','lawyer','location']})
d[['labels','raw','disambig']]









    Out[4]:






  
    
      
      labels
      raw
      disambig
    
  
  
    
      0
            patent
        5072245
              
    
    
      1
       application
               
       5002322
    
    
      2
          inventor
       11546565
       3780422
    
    
      3
          assignee
        6738097
        368684
    
    
      4
            lawyer
        3648399
        121268
    
    
      5
          location
         403853
         69640

Inventor

Raw Inventor



In [13]:

    
# inventors per patent
res = session.execute('select count(*) from rawinventor group by patent_id;')
inventor_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': inventor_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Number of Inventors')
h.set_ylabel('Patent Count')
h.set_title('Inventors per Patent')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]









    



mean 2.27642144187
median 2.0
mode 1.0
std 1.67215348567
min 1
max 76
Total: 11546565



In [14]:

    
# patents per inventor
session = sessiongen()
res = session.execute('select count(*) from patent_inventor group by inventor_id;')
patent_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': patent_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Patents')
h.set_ylabel('Inventors')
h.set_title('Patents per Inventor')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]









    



mean 3.01963405144
median 1.0
mode 1.0
std 7.50657935128
min 1
max 4582
Total: 11546565

Location

RawLocation



In [15]:

    
res = session.execute('select location.state, count(*) from rawinventor \
                       left join rawlocation on rawinventor.rawlocation_id = rawlocation.id \
                       left join location on location.id = rawlocation.location_id \
                       where location.country = "US" and length(location.state) = 2 group by location.state ')
data = res.fetchall()
inventor_counts = [int(x[1]) for x in data]
inventor_states = [unidecode(x[0]) for x in data]
d = pd.DataFrame.from_dict({'counts': inventor_counts, 'states': inventor_states})
d.index = d['states']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. States')
h.set_ylabel('Raw Inventors')
h.set_title('Raw Inventors per Reported State')
printstats(d['counts'])
print len(d['states']), 'identified states'









    



mean 106914.854545
median 41123.0
mode 7.0
std 187392.226124
min 7
max 1244222
55 identified states

Location



In [16]:

    
# cities by state
res = session.execute('select rawlocation.state, count(distinct rawlocation.city) from rawlocation \
                       where rawlocation.country = "US" and length(rawlocation.state) = 2 \
                       group by rawlocation.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (raw)')
print "Raw Locations"
printstats(d['count'])
print sum(d['count']), 'raw total cities'
# disambiguated
res = session.execute('select location.state, count(distinct location.city) from location \
                       where location.country = "US" and length(location.state) = 2 \
                       group by location.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d[['count','state']].plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (disambig)')
print "Disambiguated Locations:"
printstats(d['count'])
print sum(d['count']), 'total cities'









    



Raw Locations
mean 983.229508197
median 621.0
mode 1.0
std 1165.26165007
min 1
max 5809
59977 raw total cities
Disambiguated Locations:
mean 284.018181818
median 195.0
mode 1.0
std 263.929809055
min 1
max 1073
15621 total cities

Patents per State



In [17]:

    
# patents per state
res = session.execute("select location.state, count(*) from patent \
                       left join rawinventor on rawinventor.patent_id = patent.id \
                       left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
                       right join location on location.id = rawlocation.location_id \
                       where length(location.state) = 2 and rawinventor.sequence = 0 \
                       and location.country = 'US' \
                       group by location.state")
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Patents')
h.set_title('Patents per State')
printstats(d['count'])
print sum(d['count'])









    



mean 18535.2222222
median 7598.0
mode 1.0
std 36240.9846594
min 1
max 252941
1000902

Patents



In [18]:

    
res = session.execute('select year(date), count(*) from patent group by year(date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Grant Year')
h.set_ylabel('Patent Count')
h.set_title('Granted Patents by Grant year')
printstats(d['counts'])
print sum(d['counts']), 'total patents'









    



mean 130057.564103
median 101419.0
mode 48854.0
std 67968.1111903
min 48854
max 303478
5072245 total patents

Granted Applications



In [19]:

    
res = session.execute('select year(application.date), count(*) from patent \
                       left join application on application.patent_id = patent.id \
                       where year(application.date) != "" \
                       group by year(application.date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('App Year')
h.set_ylabel('Patent Count')
h.set_title('Granted Patents by Application year')
printstats(d['counts'])
print sum(d['counts']), 'total granted applications'









    



mean 54373.0326087
median 283.5
mode 1.0
std 74219.9389947
min 1
max 228149
5002319 total granted applications

Raw Assignees



In [20]:

    
# patents per assignee
res = session.execute('select count(*) from rawassignee group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawAssignees')
h.set_ylabel('Patents')
h.set_title('Patents per RawAssignee')









    



mean 16.016432176
median 2.0
mode 1.0
std 1077.23710111
min 1
max 661694






    Out[20]:





<matplotlib.text.Text at 0x1642918d0>

Raw Lawyers



In [21]:

    
# patents per lawyer
res = session.execute('select count(*) from rawlawyer group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawLawyers')
h.set_ylabel('Patents')
h.set_title('Patents per RawLawyer')









    



mean 70.0483641809
median 1.0
mode 1.0
std 5569.59634035
min 1
max 1266613






    Out[21]:





<matplotlib.text.Text at 0x14f9e8a50>

Citations

US Patent Citation



In [22]:

    
res = session.execute('select count(*) from uspatentcitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
res = session.execute('select count(*) from uspatentcitation group by citation_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Backward Citation Stats"
printstats(d['counts'])









    



Forward Citation Stats
mean 12.7942816581
median 7.0
mode 4.0
std 31.1585783602
min 1
max 3036
Backward Citation Stats
mean 10.5262394974
median 4.0
mode 1.0
std 22.0138044268
min 1
max 2910

Foreign Citation



In [23]:

    
res = session.execute('select count(*) from foreigncitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])
res = session.execute('select count(*) from foreigncitation group by number;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Backward Citation Stats"
printstats(d['counts'])









    



Forward Citation Stats
mean 7.96402179982
median 4.0
mode 1.0
std 17.5793650342
min 1
max 954
Backward Citation Stats
mean 2.38540606228
median 1.0
mode 1.0
std 5.55979417389
min 1
max 814

Other References



In [24]:

    
res = session.execute('select count(*) from otherreference group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])









    



Forward Citation Stats
mean 13.6768728825
median 3.0
mode 1.0
std 43.0066034322
min 1
max 2964

Application Citation



In [25]:

    
res = session.execute('select count(*) from usapplicationcitation group by patent_id;')
cit_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': cit_counts})
print "Forward Citation Stats"
printstats(d['counts'])









    



Forward Citation Stats
mean 8.10105988436
median 3.0
mode 1.0
std 22.1858136802
min 1
max 1168

Application Data



In [2]:

    
sessiongen = session_generator(dbtype='application')
session = sessiongen()

Table Counts



In [3]:

    
counts =[]
tablekeys = []
tables = ApplicationBase.metadata.tables
rawtables = tables.keys()
for table in rawtables:
    res = session.execute('select count(*) from {0}'.format(table)).fetchone()[0]
    if res:
        counts.append(res)
        tablekeys.append(table)
d = pd.DataFrame.from_dict({'tables': tablekeys, 'counts': map(lambda x: int(x), counts)})
d.index = d['tables']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('Table')
h.set_ylabel('Record Count')









    Out[3]:





<matplotlib.text.Text at 0x114452110>



In [8]:

    
patent_count = session.execute('select count(*) from application;').fetchone()[0]

rawinventor_count = session.execute('select count(*) from rawinventor;').fetchone()[0]
disambiginventor_count = session.execute('select count(*) from inventor;').fetchone()[0]

rawassignee_count = session.execute('select count(*) from rawassignee;').fetchone()[0]
disambigassignee_count = session.execute('select count(*) from assignee;').fetchone()[0]

rawlocation_count = session.execute('select count(*) from rawlocation;').fetchone()[0]
disambiglocation_count = session.execute('select count(*) from location;').fetchone()[0]

d = pd.DataFrame.from_dict({'raw': [patent_count,rawinventor_count,rawassignee_count,rawlocation_count],
                           'disambig': ['N/A',disambiginventor_count, disambigassignee_count, disambiglocation_count],
                           'labels': ['application','inventor','assignee','location']})
d[['labels','raw','disambig']]









    Out[8]:






  
    
      
      labels
      raw
      disambig
    
  
  
    
      0
       application
       3422423
           N/A
    
    
      1
          inventor
       9009611
       4392105
    
    
      2
          assignee
       1925136
        368685
    
    
      3
          location
        222423
        121331

Inventor



In [11]:

    
# inventors per patent
res = session.execute('select count(*) from rawinventor group by application_id;')
inventor_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': inventor_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Number of Inventors')
h.set_ylabel('Application Count')
h.set_title('Inventors per Application')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]









    



mean 2.63268640283
median 2.0
mode 1.0
std 1.93344233411
min 1
max 100
Total: 9009611



In [12]:

    
# patents per inventor
session = sessiongen()
res = session.execute('select count(*) from application_inventor group by inventor_id;')
patent_counts = [x[0] for x in res.fetchall()]
d = pd.DataFrame.from_dict({'counts': patent_counts})
h = d[d['counts'] < 20].hist(bins=20, figsize=(16,10))[0][0]
h.set_xlabel('Applications')
h.set_ylabel('Inventors')
h.set_title('Applications per Inventor')
printstats(d['counts'])
print 'Total:', session.execute('select count(*) from rawinventor;').fetchone()[0]









    



mean 1.01828553734
median 1.0
mode 1.0
std 0.822763437723
min 1
max 990
Total: 9009611

Location

RawLocation



In [13]:

    
res = session.execute('select location.state, count(*) from rawinventor \
                       left join rawlocation on rawinventor.rawlocation_id = rawlocation.id \
                       left join location on location.id = rawlocation.location_id \
                       where location.country = "US" and length(location.state) = 2 group by location.state ')
data = res.fetchall()
inventor_counts = [int(x[1]) for x in data]
inventor_states = [unidecode(x[0]) for x in data]
d = pd.DataFrame.from_dict({'counts': inventor_counts, 'states': inventor_states})
d.index = d['states']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. States')
h.set_ylabel('Raw Inventors')
h.set_title('Raw Inventors per Reported State')
printstats(d['counts'])
print len(d['states']), 'identified states'









    



mean 72594.862069
median 24226.5
mode 15.0
std 150918.82239
min 1
max 1074626
58 identified states

Disambiguated Location



In [14]:

    
# cities by state
res = session.execute('select rawlocation.state, count(distinct rawlocation.city) from rawlocation \
                       where rawlocation.country = "US" and length(rawlocation.state) = 2 \
                       group by rawlocation.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (raw)')
print "Raw Locations"
printstats(d['count'])
print sum(d['count']), 'raw total cities'
# disambiguated
res = session.execute('select location.state, count(distinct location.city) from location \
                       where location.country = "US" and length(location.state) = 2 \
                       group by location.state')
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d[['count','state']].plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Number of Unique Cities')
h.set_title('Cities per State (disambig)')
print "Disambiguated Locations:"
printstats(d['count'])
print sum(d['count']), 'total cities'









    



Raw Locations
mean 390.46875
median 5.0
mode 1.0
std 813.071051292
min 1
max 5636
49980 raw total cities
Disambiguated Locations:
mean 424.93220339
median 308.0
mode 2.0
std 412.873649555
min 1
max 1724
25071 total cities



In [15]:

    
# applications per state
res = session.execute("select location.state, count(*) from application \
                       left join rawinventor on rawinventor.application_id = application.id \
                       left join rawlocation on rawlocation.id = rawinventor.rawlocation_id \
                       right join location on location.id = rawlocation.location_id \
                       where length(location.state) = 2 and rawinventor.sequence = 0 \
                       and location.country = 'US' \
                       group by location.state")
data = res.fetchall()
d = pd.DataFrame.from_records(data)
d.columns = ['state','count']
d.index = d['state']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('U.S. State')
h.set_ylabel('Applications')
h.set_title('Applications per State')
printstats(d['count'])
print sum(d['count'])









    



mean 28035.3508772
median 10315.0
mode 1.0
std 56035.2411388
min 1
max 396320
1598015

Applications



In [16]:

    
res = session.execute('select year(date), count(*) from application group by year(date);')
year_counts = map(lambda x: (str(int(x[0])), int(x[1])), res.fetchall())
d = pd.DataFrame.from_dict({'dates': [x[0] for x in year_counts], 'counts': [x[1] for x in year_counts]})
d.index = d['dates']
h = d.plot(kind='bar',figsize=(16,10))
h.set_xlabel('App Year')
h.set_ylabel('Application Count')
h.set_title('Applications by year')
printstats(d['counts'])
print sum(d['counts']), 'total patents'









    



mean 171121.15
median 159806.5
mode 1.0
std 157077.904978
min 1
max 404099
3422423 total patents

Assignees



In [17]:

    
# patents per assignee
res = session.execute('select count(*) from rawassignee group by organization;')
data = res.fetchall()
d = pd.DataFrame.from_dict({'count': [int(x[0]) for x in data]})
printstats(d['count'])
h = d[d['count'] <= 20].hist(bins=20)[0][0]
h.set_xlabel('RawAssignees')
h.set_ylabel('Applications')
h.set_title('Applications per RawAssignee')









    



mean 8.78443826915
median 1.0
mode 1.0
std 216.518254708
min 1
max 51561






    Out[17]:





<matplotlib.text.Text at 0x13e535210>



In [ ]:

	labels	raw	disambig
0	patent	5072245
1	application		5002322
2	inventor	11546565	3780422
3	assignee	6738097	368684
4	lawyer	3648399	121268
5	location	403853	69640

	labels	raw	disambig
0	application	3422423	N/A
1	inventor	9009611	4392105
2	assignee	1925136	368685
3	location	222423	121331