Papers by year and keyword

parameter:


In [ ]:
#parameter:
database="wos12b"  # Oracle schema name of the Web of Science snapshot to query
searchterm="big data" # search term; must be lowercase — keywords are lower-cased in the SQL below

installs


In [ ]:
import cx_Oracle #ensure that OS, InstantClient (Basic, ODBC, SDK) and cx_Oracle are all 64 bit. Install with "pip install cx_Oracle". Add link to InstantClient in Path variable!
import pandas as pd
import plotly    #cmd: conda install plotly
plotly.__version__
# (*) To communicate with Plotly's server, sign in with credentials file
import plotly.plotly as py
import cufflinks as cf
# (*) Useful Python/Plotly tools

import random

from plotly.grid_objs import Grid, Column
import time

Database connection


In [ ]:
import os
import getpass

# Build the TNS descriptor for the bibliometrics Oracle instance
# (tunneled to localhost:6025).
dsn_tns=cx_Oracle.makedsn('127.0.0.1','6025',service_name='bibliodb01.fiz.karlsruhe')
#open connection:
# Never hardcode credentials in the notebook: read them from the
# environment, falling back to an interactive prompt.
db_user = os.environ.get('ORACLE_USER') or input('Oracle username: ')
db_password = os.environ.get('ORACLE_PASSWORD') or getpass.getpass('Oracle password: ')
db = cx_Oracle.connect(db_user, db_password, dsn_tns)
print(db.version)

functions


In [ ]:
#%% functions:
def read_query(connection, query):
    """Execute *query* on *connection* and return the result as a DataFrame.

    Column names are taken from the cursor description. The cursor is
    always closed, even when execution raises.
    """
    cur = connection.cursor()
    try:
        cur.execute(query)
        columns = [desc[0] for desc in cur.description]
        return pd.DataFrame(cur.fetchall(), columns=columns)
    finally:
        cur.close()

def array_to_string(arr):
    """Format an iterable of values as a quoted SQL IN-list literal.

    Example: [1, 'a'] -> "('1','a')". An empty iterable yields "()"
    (note: an empty IN () list is not valid Oracle SQL).

    WARNING: values are interpolated without escaping; use only with
    trusted values (here: numeric primary keys read from the database).
    """
    # join-based formatting replaces the manual first-element flag loop
    return "(" + ",".join("'" + str(value) + "'" for value in arr) + ")"
    

def get_random_color(pastel_factor = 0.5):
    """Return a random RGB triple in [0, 1], whitened toward pastel.

    A higher pastel_factor pulls all channels closer to 1 (lighter color).
    """
    channels = [random.uniform(0, 1.0) for _ in range(3)]
    return [(channel + pastel_factor) / (1.0 + pastel_factor) for channel in channels]

def color_distance(c1,c2):
    """Manhattan (L1) distance between two color triples."""
    total = 0
    for a, b in zip(c1, c2):
        total += abs(a - b)
    return total

def generate_new_color(existing_colors,pastel_factor = 0.5):
    """Sample 100 candidate colors and return the one farthest from existing ones.

    Distance to the existing palette is the minimum L1 distance to any
    existing color; the candidate maximizing it wins. When
    existing_colors is empty, the first candidate is returned right away.
    NOTE: `if not max_distance` also treats a recorded distance of 0 as
    "unset" — kept as-is to match the original behavior.
    """
    best_color = None
    max_distance = None
    for _ in range(100):
        candidate = get_random_color(pastel_factor=pastel_factor)
        if not existing_colors:
            return candidate
        nearest = min(color_distance(candidate, existing) for existing in existing_colors)
        if not max_distance or nearest > max_distance:
            max_distance = nearest
            best_color = candidate
    return best_color
    
    
def get_all_papers_json(year,class_category):
    """JSON (records orient) listing of unique papers in one (year, category) bucket.

    Reads the module-level `dataset_original` frame and the
    `CONSTGrouping` column name ("KEYWORD" or "CLASSIFICATION").
    """
    year_int = int(year)
    #get right papers:
    mask = (dataset_original['PUBYEAR'] == year_int) \
        & (dataset_original[CONSTGrouping] == class_category)
    papers = dataset_original[mask].loc[:, ['DOI', 'ARTICLE_TITLE', 'PUBYEAR', 'KEYWORD']]
    return papers.drop_duplicates().to_json(orient='records')
    
def get_all_papers(year,class_category): #html table!
    """HTML table of the unique papers in one (year, category) bucket.

    DOIs are rendered as clickable dx.doi.org links. Reads the
    module-level `dataset_original` frame and the `CONSTGrouping`
    column name.
    """
    yearint=int(year)
    #get right papers:
    outputdf=dataset_original[(dataset_original['PUBYEAR']==yearint) & (dataset_original[CONSTGrouping]==class_category)].loc[:,['DOI','ARTICLE_TITLE','PUBYEAR','KEYWORD']].drop_duplicates()
    outputdf['DOI']='<a href="http://dx.doi.org/' + outputdf['DOI'] + '" target="_blank">'+ outputdf['DOI'] +'</a>'
    # Lift the column-width limit so to_html does not truncate the links.
    # option_context replaces the old set/reset pair: -1 is deprecated
    # (modern pandas requires None for "no limit") and the old code never
    # restored the option if to_html raised.
    with pd.option_context('display.max_colwidth', None):
        return outputdf.to_html(index=False, escape=False)
    


def make_hover_text(year,class_category):
    """Build the multi-line hover label for one (year, category) bubble.

    Each statistic is looked up via getvalue(); missing buckets render
    as empty strings.
    """
    yearint=int(year)

    # Adjacent string literals reproduce the original label byte-for-byte
    # (including the indentation the old backslash continuations embedded).
    hover_template = ('%s'
                      '    <br>Number of Papers: %s '
                      '    <br>Average number of authors: %s  '
                      '    <br>Average number references: %s  '
                      '    <br>Average number of authors\' countries: %s  '
                      '    <br>Average number of authors\' institutions: %s  '
                      '    <br>Average number of citations: %s ')
    values = tuple(
        getvalue(yearint, class_category, attribute)
        for attribute in (CONSTGrouping, 'NUM_PAPERS', 'AUTHOR_MEAN', 'REF_MEAN',
                          'COUNTRY_MEAN', 'INST_MEAN', 'CIT_MEAN')
    )
    # Leftover debug print removed — it flooded the notebook output once
    # per bubble per frame.
    return hover_template % values
        
def getvalue(year,class_category,attribute):
    """Look up one aggregated attribute for a (year, category) pair.

    Returns "" when no row matches. String values pass through
    unchanged; numeric values are rounded to two decimals. Reads the
    module-level `dataset` frame and `CONSTGrouping` column name.
    """
    match = dataset[(dataset['PUBYEAR'] == year)
                    & (dataset[CONSTGrouping] == class_category)][attribute]
    if match.empty:
        return ""
    value = match.values[0]
    if isinstance(value, str):
        return value
    return round(value, 2)

get data from db

Note that access to the database is restricted to members of the Competence Centre for Bibliometrics (http://www.bibliometrie.info/) and its cooperation partners. However, you can continue with the CSV stored at the end of this block.


In [ ]:
#%% Load all existing keywords with the term big data in it (and their corresponding primary keys).
# NOTE(review): queries are built by string concatenation. `database` and
# `searchterm` come from the parameter cell and must remain trusted values;
# cx_Oracle bind variables would be the safe alternative.
# FIX: the inner SELECT hardcoded the "wos12b" schema instead of using the
# `database` parameter like every other table reference in this query.

command="""SELECT kw,pk_kw FROM  
(SELECT lower(""" + database + """.KEYWORDS.KEYWORD) kw,PK_KEYWORDS pk_kw 
FROM """ + database + """.KEYWORDS, """ + database + """.ITEMS_KEYWORDS, """ + database + """.ITEMS 
 WHERE
 """ + database + """.ITEMS_KEYWORDS.FK_KEYWORDS=""" + database + """.KEYWORDS.PK_KEYWORDS
 AND """ + database + """.ITEMS.PK_ITEMS=""" + database + """.ITEMS_KEYWORDS.FK_ITEMS  
 AND lower(""" + database + """.KEYWORDS.KEYWORD) LIKE '%""" + searchterm + """%'
)
GROUP BY kw,pk_kw
ORDER BY kw DESC
"""

df = read_query(db, command)

# We already matched some different forms of writing keywords by merging all
# to lowercase in SQL. Remove " and ' for further merging:
df['KW'].replace(regex=True, inplace=True, to_replace=r'\'', value=r'')
df['KW'].replace(regex=True, inplace=True, to_replace=r'\"', value=r'')
df = df.sort_values(by=['KW', 'PK_KW'], ascending=[1, 1])
df.to_csv("keywords.csv", sep=';')

#%% Map each merged keyword to all primary keys it covers:
kw_unique = df['KW'].unique()
indextable = {}
for keyword in kw_unique:
    # (dead strkeyword_ids computation removed here — the IN-list string is
    # rebuilt where it is actually used, in the download loop below)
    indextable[keyword] = df[df['KW'] == keyword]['PK_KW'].unique()

#%% Now get all papers.
# Collect one frame per keyword and concatenate once at the end: the old
# DataFrame.append-in-a-loop pattern is quadratic and was removed in
# pandas 2.0.
frames = []
print('loading papers for keyword:')
for i, keyword in enumerate(kw_unique, start=1):
    print('...' + keyword + ' (' + str(i) + ' of ' + str(len(kw_unique)) + ')')
    arrkeyword_ids = df[df['KW'] == keyword]['PK_KW'].unique()
    strkeyword_ids = array_to_string(arrkeyword_ids)
    command = """SELECT ROUND(PUBYEAR,0) as PUBYEAR, CLASSIFICATION, DOI, ARTICLE_TITLE,
AUTHOR_CNT, 
REF_CNT, 
COUNTRY_CNT, 
INST_CNT,
cc.COUNT AS CIT_CNT 
FROM 
""" + database + """.ITEMS i, 
""" + database + """.ITEMS_KEYWORDS i_k,
""" + database + """.CITINGCOUNTS cc,
""" + database + """.ITEMS_CLASSIFICATIONS i_c,
""" + database + """.CLASSIFICATIONS c 
WHERE i.PK_ITEMS=i_k.FK_ITEMS 
AND i_c.FK_ITEMS=i.PK_ITEMS 
AND i_c.FK_CLASSIFICATIONS=c.PK_CLASSIFICATIONS 
AND cc.FK_ITEMS=i.PK_ITEMS 
AND PUBTYPE='J' 
AND FK_KEYWORDS IN  
""" + strkeyword_ids + """ 
"""
    df2 = read_query(db, command)
    df2['KEYWORD'] = keyword
    frames.append(df2)

# concat (without ignore_index) reproduces the old append() indexing;
# guard against an empty keyword list.
df2_all_keywords = pd.concat(frames) if frames else pd.DataFrame()
df2_all_keywords.PUBYEAR = df2_all_keywords.PUBYEAR.astype(int)
df2_all_keywords.to_csv("datafromdb.csv", sep=';')

group data


In [ ]:
CONSTGrouping = "KEYWORD"  # alternative grouping column: "CLASSIFICATION"

# Reload the raw per-paper table written by the download cell; drop the
# CSV index column and the incomplete final publication year.
dataset_original = pd.read_csv("datafromdb.csv", sep=";")
dataset_original = dataset_original[dataset_original.PUBYEAR != 2015]
dataset_original = dataset_original.drop(dataset_original.columns[[0]], axis=1)

# One row per paper per group, then per-(year, group) means of the count
# columns plus the number of papers in each bucket.
dataset = dataset_original.drop_duplicates(['PUBYEAR', CONSTGrouping, 'DOI', 'ARTICLE_TITLE'])
grouped = dataset.groupby(['PUBYEAR', CONSTGrouping])
paper_counts = pd.DataFrame(grouped.size(), columns=['NUM_PAPERS'])
dataset = (grouped.agg('mean')
           .rename(columns=lambda name: name.replace("_CNT", "_MEAN"))
           .join(paper_counts)
           .reset_index())

build plotly grid


In [ ]:
# Sorted publication years, as strings for slider labels / column names.
years_from_col = set(round(dataset['PUBYEAR'], 0))
years_ints = sorted(years_from_col)
years = [str(year) for year in years_ints]


# Ordered, de-duplicated list of grouping values
# (can be 'KEYWORD' or 'CLASSIFICATION').
class_categories = []
for category in dataset[CONSTGrouping]:
    if category not in class_categories:
        class_categories.append(category)

# Build one grid Column per (year, category, source column); every column
# name must be unique within the grid.
columns = []
for year in years:
    for category in class_categories:
        bucket = dataset[(dataset['PUBYEAR'] == int(year))
                         & (dataset[CONSTGrouping] == category)]
        for header in bucket:
            column_name = '{year}_{class_category}_{header}_grid'.format(
                year=year, class_category=category, header=header
            )
            columns.append(Column(list(bucket[header]), column_name))

# Upload the grid to the Plotly servers (required for animation frames).
grid = Grid(columns)
url = py.grid_ops.upload(grid, 'grid' + str(time.time()), auto_open=False)
url

prepare layout, sliderdict and colors


In [ ]:
# Aggregated columns used for the bubble chart axes and bubble size.
CONSTxaxis='INST_MEAN'
CONSTyaxis='REF_MEAN'
CONSTsize='NUM_PAPERS'

#AUTHOR_MEAN, REF_MEAN, COUNTRY_MEAN, INST_MEAN, CIT_MEAN 


# Animation frame/transition duration in milliseconds.
CONSTduration=1500

#make figure:
# Top-level plotly figure skeleton; traces and frames are filled in by
# the following cells.
figure = {
    'data': [],
    'layout': {},
    'frames': [],
    'config': {'scrollzoom': True}  # NOTE(review): plotly's config key is 'scrollZoom' — confirm this has any effect
}

# fill in most of layout
# Axis ranges are hand-tuned for this dataset.
figure['layout']['xaxis'] = {'range': [0, 12], 'title': CONSTxaxis, 'gridcolor': '#FFFFFF'}
figure['layout']['yaxis'] = {'range': [-5, 100],'title': CONSTyaxis, 'gridcolor': '#FFFFFF'}
figure['layout']['hovermode'] = 'closest'
figure['layout']['plot_bgcolor'] = 'rgb(240, 245, 250)'
# Wide right margin leaves room for the legend outside the plot area.
figure['layout']['margin'] = {'r': 290}
# NOTE(review): 'slider' (singular) is not a standard layout key — the
# working slider is attached as layout['sliders'] in the final cell. This
# block follows the old plotly animation tutorial; presumably ignored by
# the renderer, verify before removing.
figure['layout']['slider'] = {
    'args': [
        'slider.value', {
            'duration': CONSTduration,
            'ease': 'cubic-in-out'
        }
    ],
    'initialValue': '2009',
    'plotlycommand': 'animate',
    'values': years,
    'visible': True
}
figure['layout']['showlegend']=True;
figure['layout']['legend']={'x':1,'y':1,'font':{'size':10}};
# Play/Pause buttons driving the frame animation.
figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': CONSTduration, 'redraw': True},
                         'fromcurrent': True, 'transition': {'duration': CONSTduration, 'easing': 'quadratic-in-out'}}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                # [None] + immediate mode stops the animation at the current frame.
                'args': [[None], {'frame': {'duration': 0, 'redraw': True}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]




# Year-slider skeleton; one step per year is appended in the frames cell.
sliders_dict = {
    'active': 0,
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Year:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}

# One random RGB color per grouping value.
# NOTE(review): random.randint is not seeded, so colors differ on every run.
custom_colors={}
for class_category in dataset[CONSTGrouping].unique():
    r = lambda: random.randint(0,255)
    custom_colors[class_category]='rgb('+str(r())+','+str(r())+','+str(r())+')'

define starting plot data


In [ ]:
# Template matching the grid column names built in the grid cell.
col_name_template = '{year}_{class_category}_{header}_grid'

# Initial view: one bubble trace per category for the starting year
# (2009, matching the slider's initialValue).
year = 2009
for class_category in class_categories:
    fmt = col_name_template.format
    marker = {
        'sizemode': 'area',
        'sizeref': 0.02,
        'sizesrc': grid.get_column_reference(fmt(
            year=year, class_category=class_category, header=CONSTsize)),
        'color': custom_colors[class_category]
    }
    # NOTE(review): 'paper' is not a standard plotly trace attribute; it
    # carries the HTML paper table along with the trace — verify downstream use.
    trace = {
        'xsrc': grid.get_column_reference(fmt(
            year=year, class_category=class_category, header=CONSTxaxis)),
        'ysrc': grid.get_column_reference(fmt(
            year=year, class_category=class_category, header=CONSTyaxis)),
        'mode': 'markers',
        'text': make_hover_text(year, class_category),
        'paper': get_all_papers(year, class_category),
        'hoverinfo': 'text',
        'marker': marker,
        'name': class_category[:50]
    }
    figure['data'].append(trace)

define frames for each later year


In [ ]:
# One animation frame per year, each holding one bubble trace per
# category, plus the matching slider step.
for year in years:
    frame = {'data': [], 'name': str(year)}
    for class_category in class_categories:
        fmt = col_name_template.format
        marker = {
            'sizemode': 'area',
            'sizeref': 0.02,
            'sizesrc': grid.get_column_reference(fmt(
                year=year, class_category=class_category, header=CONSTsize)),
            'color': custom_colors[class_category],
            'opacity': 0.6
        }
        # NOTE(review): 'paper' is not a standard plotly trace attribute;
        # it carries the HTML paper table along with the trace.
        trace = {
            'xsrc': grid.get_column_reference(fmt(
                year=year, class_category=class_category, header=CONSTxaxis)),
            'ysrc': grid.get_column_reference(fmt(
                year=year, class_category=class_category, header=CONSTyaxis)),
            'mode': 'markers',
            'text': make_hover_text(year, class_category),
            'paper': get_all_papers(year, class_category),
            'hoverinfo': 'text',
            'marker': marker,
            'name': class_category[:50]
        }
        frame['data'].append(trace)

    figure['frames'].append(frame)
    sliders_dict['steps'].append({
        'args': [
            [year],
            {'frame': {'duration': CONSTduration, 'redraw': True},
             'mode': 'immediate',
             'transition': {'duration': CONSTduration}}
        ],
        'label': year,
        'method': 'animate'
    })

finally, plot/upload


In [ ]:
# Attach the year slider, then upload the animated figure to the Plotly
# servers and open it in the browser.
figure['layout']['sliders'] = [sliders_dict]
py.icreate_animations(figure, 'BigDataPapers' + str(time.time()), auto_open=True)