In [71]:
import glob
from io import open
import pandas as pd
from pandas import DataFrame as df
from os import path
import re
In [72]:
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
from tools import get_psycinfo_database
In [ ]:
# Load the PsycINFO records through the project helper imported from `tools`.
# NOTE(review): presumably returns a pandas DataFrame (later cells call .head()
# and index columns on it) -- confirm against tools.get_psycinfo_database.
words_df = get_psycinfo_database()
In [77]:
# Preview the first rows of the loaded records.
words_df.head()
Out[77]:
In [78]:
#words_df.to_csv("data/PsycInfo/processed/psychinfo_combined.csv.bz2", encoding='utf-8',compression='bz2')
In [79]:
#psychinfo = pd.read_csv("data/PsycInfo/processed/psychinfo_combined.csv.bz2", encoding='utf-8', compression='bz2')
# NOTE: plain assignment, not a copy -- `psychinfo` and `words_df` are the same
# object, so columns added to `psychinfo` in later cells also appear on `words_df`.
psychinfo = words_df
In [80]:
# Flag (1/0) whether each record's search term occurs, case-insensitively,
# in its abstract. Missing terms/abstracts are treated as empty strings.
term_abstract_pairs = psychinfo[["Term", "Abstract"]].fillna("").values
abstract_occurrence = [
    1 if term.lower() in abstract.lower() else 0
    for term, abstract in term_abstract_pairs
]
psychinfo["term_in_abstract"] = abstract_occurrence
In [81]:
# Flag (1/0) whether each record's search term occurs, case-insensitively,
# in its title. Missing terms/titles are treated as empty strings.
term_title_pairs = psychinfo[["Term", "Title"]].fillna("").values
title_occurrence = [
    1 if term.lower() in title.lower() else 0
    for term, title in term_title_pairs
]
psychinfo["term_in_title"] = title_occurrence
In [82]:
# Working frame for the coded variables: the free-text Abstract and Title
# columns are dropped. `columns=` is used because passing the axis positionally
# (`drop('Abstract', 1)`) is rejected by pandas >= 2.0.
psychinfo_search = psychinfo.drop(columns=["Abstract", "Title"])
In [83]:
# Numeric ID for each search term; IDs are assigned 1..18 in the order listed.
term_ID = {
    term: number
    for number, term in enumerate(
        (
            "multiculturalism",
            "polyculturalism",
            "cultural pluralism",
            "monocultural",
            "monoracial",
            "bicultural",
            "biracial",
            "biethnic",
            "interracial",
            "multicultural",
            "multiracial",
            "polycultural",
            "polyracial",
            "polyethnic",
            "mixed race",
            "mixed ethnicity",
            "other race",
            "other ethnicity",
        ),
        start=1,
    )
}
In [84]:
# Encode the search term as its numeric ID. The lookup is an exact string match,
# so any Term value that is not a key of `term_ID` becomes NaN.
psychinfo_search["term_ID"] = psychinfo_search.Term.map(term_ID)
In [85]:
# Inspect the distinct raw "Type of Book" values and their frequencies.
psychinfo_search["Type of Book"].value_counts()
Out[85]:
In [86]:
# Numeric coding of the raw "Type of Book" strings. Multi-valued cells appear
# with either "\n\n" or "\r\r" between the values; every such combination is
# coded 5.
# NOTE(review): 'Textbook/Study Guide' and 'Reference Book' both map to 2 --
# confirm this grouping is intentional.
type_of_book = {
    'Handbook/Manual': 1,
    'Textbook/Study Guide': 2,
    'Conference Proceedings': 3,
    'Reference Book': 2,
    'Classic Book': 4,
    'Handbook/Manual\n\nTextbook/Study Guide': 5,
    'Reference Book\n\nTextbook/Study Guide': 5,
    'Classic Book\n\nTextbook/Study Guide': 5,
    'Handbook/Manual\n\nReference Book': 5,
    'Conference Proceedings\n\nTextbook/Study Guide': 5,
    'Reference Book\r\rTextbook/Study Guide': 5,
    'Conference Proceedings\r\rTextbook/Study Guide': 5,
}
In [87]:
# Apply the coding; raw values that are missing or not listed in `type_of_book`
# become NaN.
psychinfo_search["type_of_book"] = psychinfo_search["Type of Book"].map(type_of_book)
In [88]:
def _count_cited_references(text):
    """Return the number of non-blank lines in one 'Cited References' cell.

    Lines are split on any run of "\\n" / "\\r" characters, so blank separator
    lines ("\\n\\n") are not counted and "\\r\\r"-separated entries are counted
    individually. An empty or whitespace-only cell yields 0.
    """
    return sum(1 for line in re.split(r"[\r\n]+", text) if line.strip())


# Number of cited references per record, counted as non-blank lines.
# NOTE(review): assumes one reference per line -- confirm against the raw export.
# Missing cells are skipped (na_action="ignore") and stay NaN.
psychinfo_search["cited_references"] = psychinfo_search["Cited References"].map(
    _count_cited_references, na_action="ignore"
)
In [89]:
# Inspect the distinct raw "Document Type" values and their frequencies.
psychinfo_search['Document Type'].value_counts()
Out[89]:
In [95]:
# Numeric coding of the raw "Document Type" strings. Multi-valued cells appear
# with either "\n\n" or "\r\r" between the values and are listed explicitly.
document_type = {
    'Journal Article': 1,
    'Dissertation': 2,
    'Chapter': 3,
    'Review-Book': 4,
    'Comment/Reply': 6,
    'Editorial': 6,
    'Chapter\n\nReprint': 3,
    'Erratum/Correction': 6,
    'Review-Media': 6,
    'Abstract Collection': 6,
    'Letter': 6,
    'Obituary': 6,
    'Chapter\n\nComment/Reply': 3,
    'Column/Opinion': 6,
    'Reprint': 5,
    'Bibliography': 5,
    'Journal Article\n\nReprint': 1,
    'Chapter\r\rReprint': 3,
    'Chapter\n\nJournal Article\n\nReprint': 3,
    'Bibliography\n\nChapter': 3,
    'Encyclopedia Entry': 5,
    'Chapter\r\rJournal Article\r\rReprint': 3,
    'Review-Software & Other': 6,
    'Publication Information': 6,
    'Journal Article\r\rReprint': 1,
    'Reprint\n\nReview-Book': 4,
}
In [96]:
# Apply the coding; raw values that are missing or not listed in `document_type`
# become NaN.
psychinfo_search['document_type'] = psychinfo_search['Document Type'].map(document_type)
In [97]:
# Dichotomous flag: 1 when the record has any Conference text, 0 when the field
# is missing or empty.
psychinfo_search["conference_dich"] = (
    psychinfo_search["Conference"]
    .fillna("")
    .map(lambda conference: 1 if len(conference) > 0 else 0)
)
In [98]:
# Inspect the distinct raw "Publication Type" values and their frequencies.
psychinfo_search['Publication Type'].value_counts()
Out[98]:
In [99]:
# Numeric coding of the raw "Publication Type" strings. The same value pair can
# appear with either "\n\n" or "\r\r" as separator, so both spellings are listed.
# NOTE(review): 'Electronic Collection' shares code 1 with the journal types --
# confirm this grouping is intentional.
publication_type = {
    'Journal\n\nPeer Reviewed Journal': 1,
    'Book\n\nEdited Book': 3,
    'Dissertation Abstract': 2,
    'Book\n\nAuthored Book': 3,
    'Journal\r\rPeer Reviewed Journal': 1,
    'Electronic Collection': 1,
    'Journal\n\nPeer-Reviewed Status-Unknown': 1,
    'Book\r\rEdited Book': 3,
    'Book': 3,
    'Journal\r\rPeer-Reviewed Status-Unknown': 1,
    'Book\r\rAuthored Book': 3,
    'Encyclopedia': 4,
}
In [100]:
# Apply the coding; raw values that are missing or not listed in
# `publication_type` become NaN.
psychinfo_search['publication_type'] = psychinfo_search['Publication Type'].map(publication_type)
In [111]:
# conference_dich is 0/1, so the product is the publication_type code for rows
# with conference text and 0 for rows without; the counts therefore show which
# publication types carry conference information.
(psychinfo_search["publication_type"] * psychinfo_search["conference_dich"]).value_counts()
Out[111]:
In [116]:
# Rows coded as books (publication_type 3) that also carry conference text,
# shown with the raw columns behind both codes.
is_book = psychinfo_search["publication_type"] == 3
has_conference = psychinfo_search["conference_dich"] == 1
selection = is_book & has_conference
psychinfo_search.loc[selection, ["Publication Type", "Conference"]]
Out[116]:
In [25]:
# Inspect the distinct raw "Language" values and their frequencies.
psychinfo_search['Language'].value_counts()
Out[25]:
In [38]:
# Numeric coding of the raw "Language" strings: the first nine languages get
# their own code, every other listed language shares the catch-all code 10.
language = {
    'English': 1,
    'French': 2,
    'Spanish': 3,
    'Italian': 4,
    'German': 5,
    'Portuguese': 6,
    'Dutch': 7,
    'Chinese': 8,
    'Greek': 9,
}
language.update(dict.fromkeys(
    [
        'Hebrew', 'Turkish', 'Russian', 'Serbo-Croatian', 'Slovak', 'Japanese',
        'Hungarian', 'Czech', 'Danish', 'Romanian', 'Polish', 'Norwegian',
        'Swedish', 'Finnish', 'NonEnglish', 'Arabic', 'Afrikaans',
    ],
    10,
))
In [39]:
# Apply the coding; raw values that are missing or not listed in `language`
# become NaN.
psychinfo_search['language'] = psychinfo_search['Language'].map(language)
In [40]:
#psychinfo_search["PsycINFO Classification Code"].value_counts().to_csv("data/PsycInfo/processed/PsycINFO_Classification_Code.csv")
In [41]:
#psychinfo_search["Tests & Measures"].value_counts().to_csv("data/PsycInfo/processed/Tests_&_Measures.csv")
In [42]:
#psychinfo_search["Key Concepts"].value_counts().to_csv("data/PsycInfo/processed/Key_Concepts.csv")
In [43]:
#psychinfo_search["Location"].value_counts().to_csv("data/PsycInfo/processed/Location.csv")
In [44]:
#psychinfo_search["MeSH Subject Headings"].value_counts().to_csv("data/PsycInfo/processed/MeSH_Subject_Headings.csv")
In [45]:
#psychinfo_search["Journal Name"].value_counts().to_csv("data/PsycInfo/processed/Journal_Name.csv")
In [46]:
#psychinfo_search["Institution"].value_counts().to_csv("data/PsycInfo/processed/Institution.csv")
In [118]:
# Number of distinct non-missing "Population Group" values.
psychinfo_search["Population Group"].nunique()
Out[118]:
In [117]:
#psychinfo_search["Methodology"].value_counts()
In [48]:
def GetCats(text):
    """Collapse a classification-code string into one hundreds-level category.

    Every run of digits in ``text`` is read as a code and rounded down to the
    nearest hundred (e.g. 3040 -> 3000).

    Parameters
    ----------
    text : str
        Raw cell content containing zero or more numeric codes.

    Returns
    -------
    int or None
        The shared hundreds-level category when all codes agree, 4300 when the
        codes fall into more than one category, or None when ``text`` contains
        no digits (previously this case raised IndexError).
    """
    codes = re.findall("([0-9]+)", text)
    categories = {100 * (int(code) // 100) for code in codes}
    if not categories:
        # No numeric code at all: report "no category" instead of crashing.
        return None
    if len(categories) > 1:
        # NOTE(review): 4300 appears to be the catch-all used for records that
        # span several categories -- confirm the intended meaning of this code.
        return 4300
    return categories.pop()
In [49]:
# Reduce each raw classification string to a single category with GetCats;
# missing cells are skipped and stay NaN.
psychinfo_search["PsycINFO_Classification_Code"] = (
    psychinfo_search["PsycINFO Classification Code"]
    .map(GetCats, na_action="ignore")
)
In [52]:
# Number of unique categories (records without a classification code excluded).
lists = psychinfo["PsycINFO Classification Code"].map(GetCats, na_action="ignore")
len(set(lists.dropna()))
Out[52]:
In [64]:
# Dichotomous flag: 1 when the record has any Grant/Sponsorship text, 0 when the
# field is missing or empty.
psychinfo_search["grants_sponsorship"] = (
    psychinfo_search["Grant/Sponsorship"]
    .fillna("")
    .map(lambda sponsorship: 1 if len(sponsorship) > 0 else 0)
)
In [41]:
#psychinfo_search.to_csv("data/PsycInfo/processed/psychinfo_term_search.csv.bz2", encoding='utf-8', compression='bz2')
In [42]:
#psychinfo_search = psychinfo_search.drop('Title', 1)
In [126]:
#psychinfo_search["Methodology"].value_counts().to_csv("data/PsycInfo/Manual_Mapping/Methodology.csv")
In [127]:
#psychinfo_search["Population Group"].value_counts().to_csv("data/PsycInfo/Manual_Mapping/Population_Group.csv")
Keep the current spreadsheet and add the following:
**Note:** Once we extract the CSV files for these columns, I will categorize them.
Once all of these corrections have been made, make a new spreadsheet and delete the following information:
In [121]:
# Number of distinct non-missing "Population Group" values.
psychinfo_search["Population Group"].nunique()
Out[121]: