Patent Diffusion Analysis


In [1]:
import re
import csv
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import statsmodels.api as sm
import datetime
# Directory holding the prepared CSV inputs; the reads below are built as
# difPath + '/<file>.csv'.
# NOTE(review): absolute, machine-specific Windows path - point this at your
# own data directory before running the notebook elsewhere.
difPath = 'd:/diffusion_data'

In [3]:
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# pd.read_csv with index_col=0 and parse_dates=True reproduces from_csv's
# defaults (first column becomes the index, dates are parsed), so the frames
# loaded here are the same on old pandas and keep working on new pandas.

# import abstracts
bioAbs = pd.read_csv(difPath+'/bio_abstracts_prep.csv', index_col=0, parse_dates=True)
aiAbs = pd.read_csv(difPath+'/ai_abstracts_prep.csv', index_col=0, parse_dates=True)
# import patents
bioPats = pd.read_csv(difPath+'/bio_patents_prep.csv', index_col=0, parse_dates=True)
aiPats = pd.read_csv(difPath+'/ai_patents_prep.csv', index_col=0, parse_dates=True)
# import similarity data
bioSim = pd.read_csv(difPath+'/bio_similarity_prep.csv', index_col=0, parse_dates=True)
aiSim = pd.read_csv(difPath+'/ai_similarity_prep.csv', index_col=0, parse_dates=True)

Table 1 Patent and paper abstract descriptive statistics


In [6]:
# Print, for each corpus: the number of rows, the first/last year, and the
# descriptive statistics of the abstract length column (AbsLen).
#
# Each print() call receives exactly one pre-formatted string, so the cell
# produces the same text under Python 2 (print statement) and Python 3
# (print function); the original print statements were a SyntaxError on 3.
# The labels are spelled out per corpus because their wording is irregular.
corpora = (
    (aiAbs, 'JRefYear',
     'num ai abstracts:', 'ai abstract year range:', 'ai abstract length stats:'),
    (aiPats, 'gyear',
     'num ai patents:', 'ai pat year range:', 'ai pat abstract length stats:'),
    (bioAbs, 'JRefYear',
     'num bio abstracts:', 'bio abstract year range:', 'bio abstract length stats:'),
    (bioPats, 'gyear',
     'num bio patents:', 'bio pat year range:', 'bio pat abstract length stats:'),
)

for position, (frame, yearCol, countLabel, rangeLabel, statsLabel) in enumerate(corpora):
    if position > 0:
        # blank separator between consecutive corpora
        print('\n\n')
    years = frame[yearCol]
    print('%s %s' % (countLabel, len(frame)))
    print('%s %s %s' % (rangeLabel, min(years), max(years)))
    # the describe() table starts on its own line, directly under the label
    print('%s\n%s' % (statsLabel, frame.AbsLen.describe()))


num ai abstracts: 2641
ai abstract year range: 1993 2013
ai abstract length stats:
count    2641.000000
mean     1006.599773
std       385.949121
min        28.000000
25%       720.000000
50%       974.000000
75%      1247.000000
max      3124.000000
dtype: float64



num ai patents: 83
ai pat year range: 1996 1999
ai pat abstract length stats:
count      83.000000
mean     1042.867470
std       288.191534
min       327.000000
25%       832.500000
50%      1041.000000
75%      1222.000000
max      1667.000000
dtype: float64



num bio abstracts: 1854
bio abstract year range: 1983 2013
bio abstract length stats:
count    1854.000000
mean      967.619202
std       270.508664
min       236.000000
25%       773.000000
50%       977.000000
75%      1134.000000
max      2177.000000
dtype: float64



num bio patents: 150
bio pat year range: 1996 1999
bio pat abstract length stats:
count     150.000000
mean      770.080000
std       353.298883
min       119.000000
25%       500.000000
50%       694.000000
75%      1030.000000
max      1974.000000
dtype: float64

Figure 19. Barnard et al. (2003) similarity regression results. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable. Circle markers represent estimates that are significant at the 10% significance level.


In [7]:
# Load the test-case regression output; the columns used below are
# RelDumYear, DPVal and DCoef.
# pd.DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with
# index_col=0 and parse_dates=True matches its defaults.
testSim = pd.read_csv(difPath+'/simTC_simRegRaw.csv', index_col=0, parse_dates=True)
relDumYears = list(testSim.RelDumYear)

# values to plot, aligned element-wise with relDumYears
pVal = list(testSim.DPVal)
coef = list(testSim.DCoef)

# A point is drawn as a blue circle when its p-value is at most 0.1 and as a
# red cross otherwise (NaN p-values compare False and land in the second
# group). The split is by p-value in BOTH panels, so the mask is built once.
xs = np.asarray(relDumYears)
isSig = np.array([p <= 0.1 for p in pVal], dtype=bool)
markerStyles = (
    (isSig, 'o', 'b', 40),
    (~isSig, 'x', 'r', 50),
)

# plot test case regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, y-values, y-axis label, title, y-position of the 'Year Granted' note)
panels = (
    (ax1, pVal, 'Mean P-Value',
     'Matching words and pictures\nOLS P-Values by Relative Dummy Year', 0.7),
    (ax2, coef, 'Mean Coefficient',
     'Matching words and pictures\nOLS Coefficients by Relative Dummy Year', 0.0005),
)
for ax, series, yLabel, title, noteY in panels:
    ys = np.asarray(series)
    for mask, marker, colour, size in markerStyles:
        if mask.any():  # nothing to draw for an empty group
            ax.scatter(xs[mask], ys[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, annotated just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()

# only the p-value panel gets a fixed y-range; the coefficient panel autoscales
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()


Figure 20. Histogram of biotech similarity. Histograms of similarity scores for biotech before (left) and after (right) patent publication. The solid red vertical line marks the mean similarity in each histogram.


In [8]:
# Split the biotech similarity scores at relative year 0 and draw one
# histogram per side, each with a red vertical line at its mean.
# NOTE(review): RelYear == 0 is counted as "before" here, whereas the Table 2
# cell further down counts it as "on/after" - confirm which split is intended.
bioBefore = bioSim[bioSim.RelYear<=0].Similarity
bioAfter = bioSim[bioSim.RelYear>0].Similarity
meanBefore = bioBefore.mean()
meanAfter = bioAfter.mean()

# common x-range for both panels: 0 up to 10% above the overall maximum
xUpper = bioSim.Similarity.max()*1.1

fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, scores, mean of the scores, title)
histPanels = (
    (ax1, bioBefore, meanBefore,
     'Histogram Biotechnology Patent Similarity\nSimilarity Measures Before Patent Publication'),
    (ax2, bioAfter, meanAfter,
     'Histogram Biotechnology Patent Similarity\nSimilarity Measures After Patent Publication'),
)
for ax, scores, avg, title in histPanels:
    ax.hist(scores, bins=50)
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Similarity')
    ax.set_title(title)
    ax.axvline(x=avg, color='r', ls='-', lw=2)
    ax.set_xlim(0, xUpper)
    ax.grid()
plt.show()


Figure 21. Histogram of AI similarity. Histograms of similarity scores for AI before (left) and after (right) patent publication. The solid red vertical line marks the mean similarity in each histogram.


In [9]:
# Split the AI similarity scores at relative year 0 and draw one histogram
# per side, each with a red vertical line at its mean.
# The titles previously read "Biotechnology" (copied from the biotech cell)
# although the data plotted here is aiSim; they now name the AI data set.
# NOTE(review): RelYear == 0 is counted as "before" here, whereas the Table 2
# cell further down counts it as "on/after" - confirm which split is intended.
aiBefore = aiSim[aiSim.RelYear<=0].Similarity
aiAfter = aiSim[aiSim.RelYear>0].Similarity
meanBefore = aiBefore.mean()
meanAfter = aiAfter.mean()

# common x-range for both panels: 0 up to 10% above the overall maximum
xUpper = aiSim.Similarity.max()*1.1

fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, scores, mean of the scores, title)
histPanels = (
    (ax1, aiBefore, meanBefore,
     'Histogram Artificial Intelligence Patent Similarity\nSimilarity Measures Before Patent Publication'),
    (ax2, aiAfter, meanAfter,
     'Histogram Artificial Intelligence Patent Similarity\nSimilarity Measures After Patent Publication'),
)
for ax, scores, avg, title in histPanels:
    ax.hist(scores, bins=50)
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Similarity')
    ax.set_title(title)
    ax.axvline(x=avg, color='r', ls='-', lw=2)
    ax.set_xlim(0, xUpper)
    ax.grid()
plt.show()


Table 2 Similarity measures descriptive statistics


In [10]:
# Print the number of similarity calculations and their descriptive
# statistics for biotech and AI, first for RelYear < 0 and then for
# RelYear >= 0.
#
# Each print() call receives exactly one pre-formatted string, so the cell
# produces the same text under Python 2 (print statement) and Python 3
# (print function); the original print statements were a SyntaxError on 3.
periods = (
    ('before patent publication\n', lambda frame: frame[frame.RelYear<0]),
    ('on/after patent publication\n', lambda frame: frame[frame.RelYear>=0]),
)
fields = (('bio', bioSim), ('ai', aiSim))

for position, (heading, pick) in enumerate(periods):
    if position > 0:
        # blank separator between the two periods
        print('\n\n')
    print(heading)
    for tag, simFrame in fields:
        subset = pick(simFrame)
        print('num %s sim calcs: %s' % (tag, len(subset)))
        # the describe() table starts on its own line, directly under the label
        print('%s sim stats:\n%s' % (tag, subset.Similarity.describe()))


before patent publication

num bio sim calcs: 137104
bio sim stats:
count    137104.000000
mean          0.159666
std           0.028950
min           0.068588
25%           0.139833
50%           0.157116
75%           0.176492
max           0.356964
dtype: float64
num ai sim calcs: 10249
ai sim stats:
count    10249.000000
mean         0.175014
std          0.030381
min          0.078741
25%          0.153589
50%          0.172717
75%          0.194729
max          0.315691
dtype: float64



on/after patent publication

num bio sim calcs: 140996
bio sim stats:
count    140996.000000
mean          0.163146
std           0.028607
min           0.078734
25%           0.143630
50%           0.160788
75%           0.179650
max           0.427838
dtype: float64
num ai sim calcs: 208954
ai sim stats:
count    208954.000000
mean          0.173958
std           0.030326
min           0.040666
25%           0.152989
50%           0.172035
75%           0.192759
max           0.361769
dtype: float64

Figure 22. Biotech regression results. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable. Circle markers represent estimates that are significant at the 10% significance level.


In [11]:
# Load the biotech regression results; the columns used below are
# RelDumYear, DPVal and DCoef.
# pd.DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with
# index_col=0 and parse_dates=True matches its defaults.
bioSimRegs = pd.read_csv(difPath+'/bio_simRegMeans.csv', index_col=0, parse_dates=True)
relDumYears = list(bioSimRegs.RelDumYear)

# For every row, the mean p-value / coefficient over all rows sharing that
# row's RelDumYear. One grouped pass replaces re-filtering the whole frame
# once per row; the lists stay aligned element-wise with relDumYears.
byRelYear = bioSimRegs.groupby('RelDumYear')
meanPval = list(byRelYear.DPVal.transform('mean'))
meanCoef = list(byRelYear.DCoef.transform('mean'))

# A point is drawn as a blue circle when its mean p-value is at most 0.1 and
# as a red cross otherwise (NaN compares False and lands in the second
# group). The split is by p-value in BOTH panels, so the mask is built once.
xs = np.asarray(relDumYears)
isSig = np.array([p <= 0.1 for p in meanPval], dtype=bool)
markerStyles = (
    (isSig, 'o', 'b', 40),
    (~isSig, 'x', 'r', 50),
)

# plot biotech regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, y-values, y-axis label, title, y-position of the 'Year Granted' note)
panels = (
    (ax1, meanPval, 'Mean P-Value',
     'Biotechnology Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, meanCoef, 'Mean Coefficient',
     'Biotechnology Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.0005),
)
for ax, series, yLabel, title, noteY in panels:
    ys = np.asarray(series)
    for mask, marker, colour, size in markerStyles:
        if mask.any():  # nothing to draw for an empty group
            ax.scatter(xs[mask], ys[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, annotated just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()

# only the p-value panel gets a fixed y-range; the coefficient panel autoscales
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()


Figure 23. Biotech regression results, only top 10% similarity. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable, including only the top 10% of similarity scores. Circle markers represent estimates that are significant at the 10% significance level.


In [12]:
# Load the biotech regression results from bio_simTopRegMeans.csv; the
# columns used below are RelDumYear, DPVal and DCoef.
# pd.DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with
# index_col=0 and parse_dates=True matches its defaults.
bioSimRegs = pd.read_csv(difPath+'/bio_simTopRegMeans.csv', index_col=0, parse_dates=True)
relDumYears = list(bioSimRegs.RelDumYear)

# For every row, the mean p-value / coefficient over all rows sharing that
# row's RelDumYear. One grouped pass replaces re-filtering the whole frame
# once per row; the lists stay aligned element-wise with relDumYears.
byRelYear = bioSimRegs.groupby('RelDumYear')
meanPval = list(byRelYear.DPVal.transform('mean'))
meanCoef = list(byRelYear.DCoef.transform('mean'))

# A point is drawn as a blue circle when its mean p-value is at most 0.1 and
# as a red cross otherwise (NaN compares False and lands in the second
# group). The split is by p-value in BOTH panels, so the mask is built once.
xs = np.asarray(relDumYears)
isSig = np.array([p <= 0.1 for p in meanPval], dtype=bool)
markerStyles = (
    (isSig, 'o', 'b', 40),
    (~isSig, 'x', 'r', 50),
)

# plot biotech regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, y-values, y-axis label, title, y-position of the 'Year Granted' note)
panels = (
    (ax1, meanPval, 'Mean P-Value',
     'Biotechnology Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, meanCoef, 'Mean Coefficient',
     'Biotechnology Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.0005),
)
for ax, series, yLabel, title, noteY in panels:
    ys = np.asarray(series)
    for mask, marker, colour, size in markerStyles:
        if mask.any():  # nothing to draw for an empty group
            ax.scatter(xs[mask], ys[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, annotated just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()

# only the p-value panel gets a fixed y-range; the coefficient panel autoscales
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()


Figure 24. AI regression results. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable. Circle markers represent estimates that are significant at the 10% significance level.


In [13]:
# Load the AI regression results; the columns used below are RelDumYear,
# DPVal and DCoef.
# pd.DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with
# index_col=0 and parse_dates=True matches its defaults.
aiSimRegs = pd.read_csv(difPath+'/ai_simRegMeans.csv', index_col=0, parse_dates=True)
relDumYears = list(aiSimRegs.RelDumYear)

# For every row, the mean p-value / coefficient over all rows sharing that
# row's RelDumYear. One grouped pass replaces re-filtering the whole frame
# once per row; the lists stay aligned element-wise with relDumYears.
byRelYear = aiSimRegs.groupby('RelDumYear')
meanPval = list(byRelYear.DPVal.transform('mean'))
meanCoef = list(byRelYear.DCoef.transform('mean'))

# A point is drawn as a blue circle when its mean p-value is at most 0.1 and
# as a red cross otherwise (NaN compares False and lands in the second
# group). The split is by p-value in BOTH panels, so the mask is built once.
xs = np.asarray(relDumYears)
isSig = np.array([p <= 0.1 for p in meanPval], dtype=bool)
markerStyles = (
    (isSig, 'o', 'b', 40),
    (~isSig, 'x', 'r', 50),
)

# plot AI regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, y-values, y-axis label, title, y-position of the 'Year Granted' note)
panels = (
    (ax1, meanPval, 'Mean P-Value',
     'Artificial Intelligence Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, meanCoef, 'Mean Coefficient',
     'Artificial Intelligence Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.003),
)
for ax, series, yLabel, title, noteY in panels:
    ys = np.asarray(series)
    for mask, marker, colour, size in markerStyles:
        if mask.any():  # nothing to draw for an empty group
            ax.scatter(xs[mask], ys[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, annotated just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()

# only the p-value panel gets a fixed y-range; the coefficient panel autoscales
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()


Figure 25. AI regression results, only top 10% similarity. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable, including only the top 10% of similarity scores. Circle markers represent estimates that are significant at the 10% significance level.


In [14]:
# Load the AI regression results from ai_simTopRegMeans.csv; the columns
# used below are RelDumYear, DPVal and DCoef.
# pd.DataFrame.from_csv no longer exists in pandas >= 1.0; read_csv with
# index_col=0 and parse_dates=True matches its defaults.
aiSimRegs = pd.read_csv(difPath+'/ai_simTopRegMeans.csv', index_col=0, parse_dates=True)
relDumYears = list(aiSimRegs.RelDumYear)

# For every row, the mean p-value / coefficient over all rows sharing that
# row's RelDumYear. One grouped pass replaces re-filtering the whole frame
# once per row; the lists stay aligned element-wise with relDumYears.
byRelYear = aiSimRegs.groupby('RelDumYear')
meanPval = list(byRelYear.DPVal.transform('mean'))
meanCoef = list(byRelYear.DCoef.transform('mean'))

# A point is drawn as a blue circle when its mean p-value is at most 0.1 and
# as a red cross otherwise (NaN compares False and lands in the second
# group). The split is by p-value in BOTH panels, so the mask is built once.
xs = np.asarray(relDumYears)
isSig = np.array([p <= 0.1 for p in meanPval], dtype=bool)
markerStyles = (
    (isSig, 'o', 'b', 40),
    (~isSig, 'x', 'r', 50),
)

# plot AI regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)

# (axes, y-values, y-axis label, title, y-position of the 'Year Granted' note)
panels = (
    (ax1, meanPval, 'Mean P-Value',
     'Artificial Intelligence Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, meanCoef, 'Mean Coefficient',
     'Artificial Intelligence Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.003),
)
for ax, series, yLabel, title, noteY in panels:
    ys = np.asarray(series)
    for mask, marker, colour, size in markerStyles:
        if mask.any():  # nothing to draw for an empty group
            ax.scatter(xs[mask], ys[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, annotated just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()

# only the p-value panel gets a fixed y-range; the coefficient panel autoscales
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()