In [1]:
import csv
import datetime
import os
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats
# Root folder holding every prepared CSV read by this notebook. Set the
# DIFFUSION_DATA_DIR environment variable to point at another copy of the
# data; when it is unset the original location is used.
difPath = os.environ.get('DIFFUSION_DATA_DIR', 'd:/diffusion_data')
In [3]:
# Load the prepared inputs from difPath.
# pd.read_csv(path, index_col=0, parse_dates=True) is the documented
# replacement for DataFrame.from_csv (deprecated in pandas 0.21, removed in
# 1.0) and applies the same defaults from_csv used: the first column becomes
# the index and date parsing is attempted on that index.
# import abstracts
bioAbs = pd.read_csv(difPath+'/bio_abstracts_prep.csv', index_col=0, parse_dates=True)
aiAbs = pd.read_csv(difPath+'/ai_abstracts_prep.csv', index_col=0, parse_dates=True)
# import patents
bioPats = pd.read_csv(difPath+'/bio_patents_prep.csv', index_col=0, parse_dates=True)
aiPats = pd.read_csv(difPath+'/ai_patents_prep.csv', index_col=0, parse_dates=True)
# import similarity data
bioSim = pd.read_csv(difPath+'/bio_similarity_prep.csv', index_col=0, parse_dates=True)
aiSim = pd.read_csv(difPath+'/ai_similarity_prep.csv', index_col=0, parse_dates=True)
Table 1 Patent and paper abstract descriptive statistics
In [6]:
# Table 1: row count, year range and abstract-length summary per corpus.
# Every print call receives one pre-formatted string, so the cell emits the
# same text under the Python 2 print statement and the Python 3 print
# function (the original print statements do not parse on Python 3).
corpusSummaries = [
    # (count label, year-range label, length-stats label, frame, year column)
    ('num ai abstracts:', 'ai abstract year range:', 'ai abstract length stats:', aiAbs, 'JRefYear'),
    ('num ai patents:', 'ai pat year range:', 'ai pat abstract length stats:', aiPats, 'gyear'),
    ('num bio abstracts:', 'bio abstract year range:', 'bio abstract length stats:', bioAbs, 'JRefYear'),
    ('num bio patents:', 'bio pat year range:', 'bio pat abstract length stats:', bioPats, 'gyear'),
]
for position, (countLabel, rangeLabel, statsLabel, frame, yearCol) in enumerate(corpusSummaries):
    if position:
        # blank-line separator between corpora (not before the first one)
        print('\n\n')
    years = frame[yearCol]
    print('{0} {1}'.format(countLabel, len(frame)))
    print('{0} {1} {2}'.format(rangeLabel, min(years), max(years)))
    # the describe() table starts on its own line, directly under the label
    print('{0}\n{1}'.format(statsLabel, frame.AbsLen.describe()))
Figure 19 Barnard et al. (2003) similarity regression results. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable. Circle markers represent estimates that are statistically significant at the 10% level.
In [7]:
# Figure 19: p-values (left) and coefficients (right) of the test-case
# regression, plotted against the relative dummy year.
# read_csv with index_col=0 / parse_dates=True replaces the removed
# DataFrame.from_csv and reads the file the same way.
testSim = pd.read_csv(difPath+'/simTC_simRegRaw.csv', index_col=0, parse_dates=True)
relDumYears = list(testSim.RelDumYear)
# plain lists are kept under their original names; plotting uses the arrays
pVal = list(testSim.DPVal)
coef = list(testSim.DCoef)
yearArr = testSim.RelDumYear.values
pValArr = testSim.DPVal.values
coefArr = testSim.DCoef.values
# estimates with p <= 0.1 are drawn as blue circles, all others as red crosses
isSignificant = pValArr <= 0.1
markerGroups = [
    (isSignificant, 'o', 'b', 40),
    (~isSignificant, 'x', 'r', 50),
]
# plot test case regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
panels = [
    # (axes, y values, y label, title, y position of the 'Year Granted' note)
    (ax1, pValArr, 'Mean P-Value',
     'Matching words and pictures\nOLS P-Values by Relative Dummy Year', 0.7),
    (ax2, coefArr, 'Mean Coefficient',
     'Matching words and pictures\nOLS Coefficients by Relative Dummy Year', 0.0005),
]
for ax, values, yLabel, title, noteY in panels:
    for mask, marker, colour, size in markerGroups:
        # one scatter call per marker style instead of one call per point
        if mask.any():
            ax.scatter(yearArr[mask], values[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, labelled just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()
# the p-value panel is pinned to the 0-1 range
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()
Figure 20 Histogram biotech similarity. Histograms of similarity scores for biotech before (left) and after (right) patent publication. The solid red vertical line marks the mean similarity in each histogram.
In [8]:
# Figure 20: biotech similarity scores split at the publication year, one
# 50-bin histogram per period with the period mean drawn as a red line.
# NOTE(review): RelYear == 0 is counted as "before" here, while the Table 2
# cell groups RelYear >= 0 under "on/after" - confirm which split is intended.
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
simBefore = bioSim[bioSim.RelYear<=0].Similarity
simAfter = bioSim[bioSim.RelYear>0].Similarity
meanBefore = simBefore.mean()
meanAfter = simAfter.mean()
histPanels = [
    (ax1, simBefore, meanBefore,
     'Histogram Biotechnology Patent Similarity\nSimilarity Measures Before Patent Publication'),
    (ax2, simAfter, meanAfter,
     'Histogram Biotechnology Patent Similarity\nSimilarity Measures After Patent Publication'),
]
for ax, scores, periodMean, title in histPanels:
    ax.hist(scores,bins=50)
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Similarity')
    ax.set_title(title)
    ax.axvline(x=periodMean, color='r', ls='-', lw=2)
    # both panels share an x-range based on the overall maximum similarity
    ax.set_xlim(0,bioSim.Similarity.max()*1.1)
    ax.grid()
plt.show()
Figure 21 Histogram AI similarity. Histograms of similarity scores for AI before (left) and after (right) patent publication. The solid red vertical line marks the mean similarity in each histogram.
In [9]:
# Figure 21: AI similarity scores split at the publication year, one 50-bin
# histogram per period with the period mean drawn as a red line.
# The titles now name Artificial Intelligence; the original cell still carried
# the "Biotechnology" titles although it plots aiSim.
# NOTE(review): RelYear == 0 is counted as "before" here, while the Table 2
# cell groups RelYear >= 0 under "on/after" - confirm which split is intended.
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
simBefore = aiSim[aiSim.RelYear<=0].Similarity
simAfter = aiSim[aiSim.RelYear>0].Similarity
meanBefore = simBefore.mean()
meanAfter = simAfter.mean()
# both panels share an x-range based on the overall maximum similarity
xUpper = aiSim.Similarity.max()*1.1
histPanels = [
    (ax1, simBefore, meanBefore,
     'Histogram Artificial Intelligence Patent Similarity\nSimilarity Measures Before Patent Publication'),
    (ax2, simAfter, meanAfter,
     'Histogram Artificial Intelligence Patent Similarity\nSimilarity Measures After Patent Publication'),
]
for ax, scores, periodMean, title in histPanels:
    ax.hist(scores,bins=50)
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Similarity')
    ax.set_title(title)
    ax.axvline(x=periodMean, color='r', ls='-', lw=2)
    ax.set_xlim(0,xUpper)
    ax.grid()
plt.show()
Table 2 Similarity measures descriptive statistics
In [10]:
# Table 2: count and describe() summary of similarity scores for each field,
# split into RelYear < 0 and RelYear >= 0.
# Every print call receives one pre-formatted string, so the cell emits the
# same text under the Python 2 print statement and the Python 3 print
# function (the original print statements do not parse on Python 3).
# NOTE(review): RelYear == 0 falls under "on/after" here, whereas the
# histogram cells filter RelYear <= 0 for "before" - confirm the intended split.
simFrames = [('bio', bioSim), ('ai', aiSim)]
periods = [
    ('before patent publication\n', lambda frame: frame.RelYear < 0),
    ('on/after patent publication\n', lambda frame: frame.RelYear >= 0),
]
for position, (heading, inPeriod) in enumerate(periods):
    if position:
        # blank-line separator between the two periods
        print('\n\n')
    print(heading)
    for label, frame in simFrames:
        subset = frame[inPeriod(frame)]
        print('num {0} sim calcs: {1}'.format(label, len(subset)))
        # the describe() table starts on its own line, directly under the label
        print('{0} sim stats:\n{1}'.format(label, subset.Similarity.describe()))
Figure 22 Biotech regression results. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable. Circle markers represent estimates that are statistically significant at the 10% level.
In [11]:
# Figure 22: mean p-value (left) and mean coefficient (right) of the biotech
# dummy regressions, per relative dummy year.
# read_csv with index_col=0 / parse_dates=True replaces the removed
# DataFrame.from_csv and reads the file the same way.
bioSimRegs = pd.read_csv(difPath+'/bio_simRegMeans.csv', index_col=0, parse_dates=True)
# One groupby pass yields the mean per relative year. The original re-filtered
# the whole frame once per row, which is quadratic and repeats a year's marker
# for every row that shares that year.
yearMeans = bioSimRegs.groupby('RelDumYear')[['DPVal', 'DCoef']].mean()
relDumYears = list(yearMeans.index)
# create lists of mean values (kept under their original names)
meanPval = list(yearMeans.DPVal)
meanCoef = list(yearMeans.DCoef)
yearArr = yearMeans.index.values
pValArr = yearMeans.DPVal.values
coefArr = yearMeans.DCoef.values
# means with p <= 0.1 are drawn as blue circles, all others as red crosses
isSignificant = pValArr <= 0.1
markerGroups = [
    (isSignificant, 'o', 'b', 40),
    (~isSignificant, 'x', 'r', 50),
]
# plot biotech regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
panels = [
    # (axes, y values, y label, title, y position of the 'Year Granted' note)
    (ax1, pValArr, 'Mean P-Value',
     'Biotechnology Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, coefArr, 'Mean Coefficient',
     'Biotechnology Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.0005),
]
for ax, values, yLabel, title, noteY in panels:
    for mask, marker, colour, size in markerGroups:
        # one scatter call per marker style instead of one call per point
        if mask.any():
            ax.scatter(yearArr[mask], values[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, labelled just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()
# the p-value panel is pinned to the 0-1 range
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()
Figure 23 Biotech regression results, only top 10% similarity. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable, including only the top 10% of similarity scores. Circle markers represent estimates that are statistically significant at the 10% level.
In [12]:
# Figure 23: mean p-value (left) and mean coefficient (right) of the biotech
# dummy regressions read from the "simTopRegMeans" file, per relative dummy year.
# read_csv with index_col=0 / parse_dates=True replaces the removed
# DataFrame.from_csv and reads the file the same way.
bioSimRegs = pd.read_csv(difPath+'/bio_simTopRegMeans.csv', index_col=0, parse_dates=True)
# One groupby pass yields the mean per relative year. The original re-filtered
# the whole frame once per row, which is quadratic and repeats a year's marker
# for every row that shares that year.
yearMeans = bioSimRegs.groupby('RelDumYear')[['DPVal', 'DCoef']].mean()
relDumYears = list(yearMeans.index)
# create lists of mean values (kept under their original names)
meanPval = list(yearMeans.DPVal)
meanCoef = list(yearMeans.DCoef)
yearArr = yearMeans.index.values
pValArr = yearMeans.DPVal.values
coefArr = yearMeans.DCoef.values
# means with p <= 0.1 are drawn as blue circles, all others as red crosses
isSignificant = pValArr <= 0.1
markerGroups = [
    (isSignificant, 'o', 'b', 40),
    (~isSignificant, 'x', 'r', 50),
]
# plot biotech regression results
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
panels = [
    # (axes, y values, y label, title, y position of the 'Year Granted' note)
    (ax1, pValArr, 'Mean P-Value',
     'Biotechnology Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, coefArr, 'Mean Coefficient',
     'Biotechnology Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.0005),
]
for ax, values, yLabel, title, noteY in panels:
    for mask, marker, colour, size in markerGroups:
        # one scatter call per marker style instead of one call per point
        if mask.any():
            ax.scatter(yearArr[mask], values[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, labelled just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()
# the p-value panel is pinned to the 0-1 range
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()
Figure 24 AI regression results. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable. Circle markers represent estimates that are statistically significant at the 10% level.
In [13]:
# Figure 24: mean p-value (left) and mean coefficient (right) of the AI dummy
# regressions, per relative dummy year.
# read_csv with index_col=0 / parse_dates=True replaces the removed
# DataFrame.from_csv and reads the file the same way.
aiSimRegs = pd.read_csv(difPath+'/ai_simRegMeans.csv', index_col=0, parse_dates=True)
# One groupby pass yields the mean per relative year. The original re-filtered
# the whole frame once per row, which is quadratic and repeats a year's marker
# for every row that shares that year.
yearMeans = aiSimRegs.groupby('RelDumYear')[['DPVal', 'DCoef']].mean()
relDumYears = list(yearMeans.index)
# create lists of mean values (kept under their original names)
meanPval = list(yearMeans.DPVal)
meanCoef = list(yearMeans.DCoef)
yearArr = yearMeans.index.values
pValArr = yearMeans.DPVal.values
coefArr = yearMeans.DCoef.values
# means with p <= 0.1 are drawn as blue circles, all others as red crosses
isSignificant = pValArr <= 0.1
markerGroups = [
    (isSignificant, 'o', 'b', 40),
    (~isSignificant, 'x', 'r', 50),
]
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
panels = [
    # (axes, y values, y label, title, y position of the 'Year Granted' note)
    (ax1, pValArr, 'Mean P-Value',
     'Artificial Intelligence Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, coefArr, 'Mean Coefficient',
     'Artificial Intelligence Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.003),
]
for ax, values, yLabel, title, noteY in panels:
    for mask, marker, colour, size in markerGroups:
        # one scatter call per marker style instead of one call per point
        if mask.any():
            ax.scatter(yearArr[mask], values[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, labelled just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()
# the p-value panel is pinned to the 0-1 range
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()
Figure 25 AI regression results, only top 10% similarity. The plots show mean p-values (left) and coefficients (right) for each relative dummy year variable, including only the top 10% of similarity scores. Circle markers represent estimates that are statistically significant at the 10% level.
In [14]:
# Figure 25: mean p-value (left) and mean coefficient (right) of the AI dummy
# regressions read from the "simTopRegMeans" file, per relative dummy year.
# read_csv with index_col=0 / parse_dates=True replaces the removed
# DataFrame.from_csv and reads the file the same way.
aiSimRegs = pd.read_csv(difPath+'/ai_simTopRegMeans.csv', index_col=0, parse_dates=True)
# One groupby pass yields the mean per relative year. The original re-filtered
# the whole frame once per row, which is quadratic and repeats a year's marker
# for every row that shares that year.
yearMeans = aiSimRegs.groupby('RelDumYear')[['DPVal', 'DCoef']].mean()
relDumYears = list(yearMeans.index)
# create lists of mean values (kept under their original names)
meanPval = list(yearMeans.DPVal)
meanCoef = list(yearMeans.DCoef)
yearArr = yearMeans.index.values
pValArr = yearMeans.DPVal.values
coefArr = yearMeans.DCoef.values
# means with p <= 0.1 are drawn as blue circles, all others as red crosses
isSignificant = pValArr <= 0.1
markerGroups = [
    (isSignificant, 'o', 'b', 40),
    (~isSignificant, 'x', 'r', 50),
]
fig = plt.figure(figsize=(13,5))
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
panels = [
    # (axes, y values, y label, title, y position of the 'Year Granted' note)
    (ax1, pValArr, 'Mean P-Value',
     'Artificial Intelligence Patent Dummy Regression P-Values\nMean OLS P-Values by Relative Dummy Year', 0.7),
    (ax2, coefArr, 'Mean Coefficient',
     'Artificial Intelligence Patent Dummy Regression Coefficients\nMean OLS Coefficients by Relative Dummy Year', 0.003),
]
for ax, values, yLabel, title, noteY in panels:
    for mask, marker, colour, size in markerGroups:
        # one scatter call per marker style instead of one call per point
        if mask.any():
            ax.scatter(yearArr[mask], values[mask], marker=marker, c=colour, s=size)
    ax.set_ylabel(yLabel)
    ax.set_xlabel('Dummy Year Relative to Publication')
    ax.set_title(title)
    # dashed red line at relative year 0, labelled just to its right
    ax.axvline(x=0, color='r', ls='--', lw=2)
    ax.text(1, noteY, 'Year Granted', fontsize=12)
    ax.grid()
# the p-value panel is pinned to the 0-1 range
ax1.set_ylim(0,1)
#plt.savefig('pat_diffusion.png')
plt.show()