In [71]:
import glob
from io import open
import pandas as pd
from pandas import DataFrame as df
from os import path
import re

In [72]:
import matplotlib.pyplot as plt
%matplotlib inline

Merge CSV databases


In [ ]:
from tools import get_psycinfo_database

In [ ]:
words_df = get_psycinfo_database()

In [77]:
words_df.head()


Out[77]:
Abstract Accession Number Author(s) Type of Book PsycINFO Classification Code Conference Document Type Grant/Sponsorship Key Concepts Institution ... Population Group Publication Status Publication Type Publisher Cited References Title Tests & Measures Volume Date Term
0 PURPOSE: Rates of alcohol use may be increasin... Peer Reviewed Journal: 2015-52719-001. Kane, Jeremy C\n\nJohnson, Renee M\n\nRobinson... NaN Health & Mental Health Treatment & Prevention ... NaN NaN NaN Acculturation, Intergenerational cultural diss... NaN ... NaN First Posting Journal\n\nPeer Reviewed Journal Elsevier Science; Netherlands NaN The impact of intergenerational cultural disso... NaN NaN 2015 bicultural
1 Given the negative developmental risks associa... Peer Reviewed Journal: 2015-52548-001. Killoren, Sarah E\n\nZeiders, Katharine H\n\nU... NaN Developmental Psychology [2800]. NaN NaN NaN Adolescence, Cultural context, Mexican-America... Killoren, Sarah E.: Department of Human Develo... ... NaN First Posting Journal\n\nPeer Reviewed Journal Springer; Germany NaN The sociocultural context of mexican-origin pr... NaN NaN 2015 bicultural
2 (from the chapter) Assessment science is an es... Book: 2013-02670-011. Dana, Richard H Handbook/Manual Personality Scales & Inventories [2223]. NaN Chapter NaN personality tests, psychology, assessment, cul... NaN ... Human NaN Book\n\nEdited Book American Psychological Association; US Aiken, L. S., West, S. G., & Millsap, R. E. (2... Personality tests and psychological science: I... California Brief Multicultural Competency Scal... NaN 2014 bicultural
3 Objective: The aim of the study was to explore... Peer Reviewed Journal: 2015-46649-006. Goutaudier, N\n\nChauchard, E\n\nMelioli, T\n\... NaN Psychosocial & Personality Development [2840]. NaN Journal Article NaN Acculturation, Adolescence, Cluster analysis, ... Goutaudier, N.: Laboratoire CERPP-OCTOGONE, UF... ... Human. Male. Female. Adolescence (13-17 yrs) NaN Journal\n\nPeer Reviewed Journal Elsevier Masson SAS; France Aubry, B., & Tribalat, M. (2009). Les jeunes d... Acculturation orientations and psychosocial ad... Immigrant Acculturation Scale\nRosenberg Self-... 41 2015 bicultural
4 (from the chapter) In Germany, the visit of th... Book: 2014-27297-015. Leyendecker, Birgit\n\nWillard, Jessica\n\nAga... NaN Cognitive & Perceptual Development [2820]. NaN Chapter <b>Sponsor: </b>NORFACE. ERA-NET\n<b>Grant: </... children's bilingual development, parents, imm... Leyendecker, Birgit: Ruhr University Bochum, B... ... Human. Childhood (birth-12 yrs) NaN Book\n\nEdited Book Ashgate Publishing Co; US Adesope, O. O., Lavin, T., Thompson, T., & Ung... Learning a host country: A plea to strengthen ... NaN NaN 2014 bicultural

5 rows × 29 columns


In [78]:
#words_df.to_csv("data/PsycInfo/processed/psychinfo_combined.csv.bz2", encoding='utf-8',compression='bz2')

Load PsychINFO unified database


In [79]:
#psychinfo = pd.read_csv("data/PsycInfo/processed/psychinfo_combined.csv.bz2", encoding='utf-8', compression='bz2')
psychinfo = words_df

Term appearance in abstract and title


In [80]:
abstract_occurrence = []
for x,y in psychinfo[["Term", "Abstract"]].fillna("").values:
  if x.lower() in y.lower():
    abstract_occurrence.append(1)
  else:
    abstract_occurrence.append(0)
psychinfo["term_in_abstract"] = abstract_occurrence

In [81]:
title_occurrence = []
for x,y in psychinfo[["Term", "Title"]].fillna("").values:
  if x.lower() in y.lower():
    title_occurrence.append(1)
  else:
    title_occurrence.append(0)
psychinfo["term_in_title"] = title_occurrence

In [82]:
psychinfo_search = psychinfo.drop('Abstract', 1)
psychinfo_search = psychinfo_search.drop('Title', 1)

In [83]:
term_ID = {"multiculturalism": 1, "polyculturalism": 2, "cultural pluralism": 3, 
           "monocultural": 4, "monoracial": 5, "bicultural": 6, 
           "biracial": 7, "biethnic": 8, "interracial": 9, 
           "multicultural": 10, "multiracial": 11, "polycultural": 12, 
           "polyracial": 13, "polyethnic": 14, "mixed race": 15, 
           "mixed ethnicity": 16, "other race": 17, "other ethnicity": 18}

In [84]:
psychinfo_search["term_ID"] = psychinfo_search.Term.map(term_ID)

In [85]:
psychinfo_search["Type of Book"].value_counts()


Out[85]:
Handbook/Manual                                   1395
Textbook/Study Guide                               533
Conference Proceedings                              53
Reference Book                                      45
Classic Book                                        25
Handbook/Manual\n\nTextbook/Study Guide             16
Reference Book\n\nTextbook/Study Guide               6
Classic Book\n\nTextbook/Study Guide                 2
Reference Book\r\rTextbook/Study Guide               1
Conference Proceedings\n\nTextbook/Study Guide       1
Handbook/Manual\n\nReference Book                    1
Conference Proceedings\r\rTextbook/Study Guide       1
Name: Type of Book, dtype: int64

In [86]:
type_of_book = { 'Handbook/Manual': 1, 'Textbook/Study Guide': 2, 'Conference Proceedings': 3,
       'Reference Book': 2, 'Classic Book': 4,'Handbook/Manual\n\nTextbook/Study Guide': 5,
       'Reference Book\n\nTextbook/Study Guide': 5,'Classic Book\n\nTextbook/Study Guide': 5,
       'Handbook/Manual\n\nReference Book': 5,'Conference Proceedings\n\nTextbook/Study Guide': 5,
       'Reference Book\r\rTextbook/Study Guide': 5,'Conference Proceedings\r\rTextbook/Study Guide': 5}

In [87]:
psychinfo_search["type_of_book"] = psychinfo_search["Type of Book"].map(type_of_book)

In [88]:
psychinfo_search["cited_references"] = psychinfo_search['Cited References'].map(lambda text:len(text.strip().split("\n")),"ignore")

In [89]:
psychinfo_search['Document Type'].value_counts()


Out[89]:
Journal Article                          14369
Dissertation                              4919
Chapter                                   4558
Review-Book                               1444
Comment/Reply                              548
Editorial                                  228
Chapter\n\nReprint                          78
Erratum/Correction                          66
Review-Media                                35
Abstract Collection                         29
Letter                                      18
Obituary                                    13
Chapter\n\nComment/Reply                    10
Reprint                                      9
Column/Opinion                               9
Bibliography                                 8
Journal Article\n\nReprint                   7
Chapter\r\rReprint                           6
Chapter\n\nJournal Article\n\nReprint        5
Encyclopedia Entry                           5
Bibliography\n\nChapter                      5
Chapter\r\rJournal Article\r\rReprint        2
Reprint\n\nReview-Book                       1
Publication Information                      1
Review-Software & Other                      1
Journal Article\r\rReprint                   1
Name: Document Type, dtype: int64

In [95]:
document_type = {'Journal Article': 1, 'Dissertation': 2, 'Chapter': 3, 'Review-Book': 4,
       'Comment/Reply': 6, 'Editorial': 6, 'Chapter\n\nReprint': 3,
       'Erratum/Correction': 6, 'Review-Media': 6, 'Abstract Collection': 6,
       'Letter': 6, 'Obituary': 6, 'Chapter\n\nComment/Reply': 3, 'Column/Opinion': 6,
       'Reprint': 5, 'Bibliography': 5, 'Journal Article\n\nReprint': 1,
       'Chapter\r\rReprint': 3, 'Chapter\n\nJournal Article\n\nReprint': 3,
       'Bibliography\n\nChapter': 3, 'Encyclopedia Entry': 5,
       'Chapter\r\rJournal Article\r\rReprint': 3, 'Review-Software & Other': 6,
       'Publication Information': 6, 'Journal Article\r\rReprint': 1,
       'Reprint\n\nReview-Book': 4}

In [96]:
psychinfo_search['document_type'] = psychinfo_search['Document Type'].map(document_type)

In [97]:
psychinfo_search["conference_dich"] = psychinfo_search["Conference"].fillna("").map(lambda x: int((len(x) > 0)))

In [98]:
psychinfo_search['Publication Type'].value_counts()


Out[98]:
Journal\n\nPeer Reviewed Journal           15714
Book\n\nEdited Book                         5402
Dissertation Abstract                       4919
Book\n\nAuthored Book                        890
Journal\r\rPeer Reviewed Journal             468
Electronic Collection                        454
Journal\n\nPeer-Reviewed Status-Unknown      234
Book\r\rEdited Book                          155
Book                                          30
Journal\r\rPeer-Reviewed Status-Unknown       14
Book\r\rAuthored Book                         13
Encyclopedia                                  11
Name: Publication Type, dtype: int64

In [99]:
publication_type = {'Journal\n\nPeer Reviewed Journal': 1, 'Book\n\nEdited Book': 3,
       'Dissertation Abstract': 2, 'Book\n\nAuthored Book': 3,
       'Journal\r\rPeer Reviewed Journal': 1, 'Electronic Collection': 1,
       'Journal\n\nPeer-Reviewed Status-Unknown': 1, 'Book\r\rEdited Book': 3,
       'Book': 3, 'Journal\r\rPeer-Reviewed Status-Unknown': 1,
       'Book\r\rAuthored Book': 3, 'Encyclopedia': 4}

In [100]:
psychinfo_search['publication_type'] = psychinfo_search['Publication Type'].map(publication_type)

In [111]:
(psychinfo_search["publication_type"] * psychinfo_search["conference_dich"]).value_counts()


Out[111]:
0    27380
1      773
3      151
dtype: int64

In [116]:
selection = (psychinfo_search["publication_type"] == 3) * (psychinfo_search["conference_dich"] == 1)
psychinfo_search[selection][["Publication Type", "Conference"]]


//anaconda/lib/python3.5/site-packages/pandas/computation/expressions.py:190: UserWarning: evaluating in Python space because the '*' operator is not supported by numexpr for the bool dtype, use '&' instead
  unsupported[op_str]))
Out[116]:
Publication Type Conference
707 Book\n\nEdited Book 2005 IEEE International Professional Communica...
800 Book\n\nEdited Book 2006 ACA Annual Convention. 2006. US. The arti...
801 Book\n\nEdited Book 2006 ACA Annual Convention. 2006. US. The arti...
1027 Book\n\nEdited Book Ontario Symposium on Personality and Social Ps...
1501 Book\n\nEdited Book Bienneial Meeting of the International Society...
1553 Book\n\nAuthored Book Earlier versions of several parts of this book...
1591 Book\n\nEdited Book Mental Health of Immigrants and Refugees. Mar,...
1607 Book\n\nEdited Book International Conference of the International ...
1638 Book\n\nEdited Book Conference on Childhood Bilingualism. Jun, 198...
410 Book\r\rEdited Book 24th Spring Meeting, Division 39, APA. Apr, 20...
439 Book\r\rEdited Book The annual meeting of the Association for Wome...
727 Book\r\rEdited Book The chapters contained in this book emerged fr...
750 Book\r\rAuthored Book Based on the Interactive Forum on Transference...
808 Book\r\rEdited Book An earlier version of this paper was presented...
152 Book\n\nEdited Book A major portion of the works in this volume or...
739 Book\n\nEdited Book First Universal Races Conference. 1. Jul, 1911...
897 Book\n\nEdited Book Gender & Empire. Oct, 2004. Otago University. ...
900 Book\n\nEdited Book Gender & Empire. Oct, 2004. University of Otag...
984 Book\n\nEdited Book Nebraska Symposium on Motivation. 53. The afor...
1180 Book\n\nEdited Book 24th Spring Meeting, Division 39, APA. Apr, 20...
1225 Book\n\nEdited Book The annual meeting of the Association for Wome...
1229 Book\n\nEdited Book The annual meeting of the Midwestern Conferenc...
1302 Book\n\nEdited Book Annual Meeting of the Society for the Study of...
1461 Book\n\nEdited Book Annual Conference of the American Association ...
34 Book\n\nEdited Book Joint Conference of the Internataional Society...
548 Book\n\nEdited Book A portion of the chapter by D. Byrne was read ...
585 Book\n\nEdited Book International Conference of the American Couns...
276 Book\n\nEdited Book Gender & Empire. Oct, 2004. Otago University. ...
438 Book\n\nEdited Book Gender and Power in Families. 1987. London. Un...
163 Book\n\nEdited Book Rutgers Invitational Symposium on Education. 1...
... ... ...
368 Book\n\nEdited Book National Conference on Education & Training in...
517 Book\n\nEdited Book Annual Convention of the American Psychologica...
640 Book\n\nEdited Book Rutgers Invitational Symposium on Education. 1...
942 Book\n\nEdited Book International Congress of the International As...
1026 Book\n\nEdited Book Minnesota Symposium on Child Psychology.. 29th...
1068 Book\n\nEdited Book International Interdisciplinary Conference on ...
1455 Book\n\nEdited Book International Conference on Computer Support f...
1498 Book\n\nEdited Book Annual Meeting of the National Reading Confere...
1520 Book\n\nEdited Book This volume grew out of a conference entitled ...
1568 Book\n\nEdited Book The papers in this volume are expansions of ta...
1578 Book\n\nEdited Book International Conference of the International ...
372 Book\n\nEdited Book European Conference on Traumatic Stress. 10th....
825 Book\n\nEdited Book 2006 ACA Annual Convention. 2006. US. The arti...
175 Book\n\nEdited Book Symposium XXI of the Association for the Advan...
177 Book\n\nEdited Book Youth in Cities: Successful Mediators of Norma...
228 Book\n\nEdited Book Symposium on the Psychosocial Consequences of ...
229 Book\n\nEdited Book Symposium on the Psychosocial Consequences of ...
335 Book\n\nEdited Book 27th Interamerican Congress of Psychology. Jun...
336 Book\n\nEdited Book International Congress of the International As...
381 Book\n\nEdited Book Conference on Sexual Orientation and the Law.....
453 Book\n\nEdited Book Annual Meeting of the National Reading Confere...
477 Book\n\nEdited Book An earlier version of this chapter was present...
478 Book\n\nEdited Book Portions of this chapter were presented at the...
488 Book\n\nEdited Book The collection presented here stems from a con...
509 Book\n\nEdited Book The Future of Literacy in a Changing World. Ma...
180 Book\n\nEdited Book 8th annual consumer culture theory conference....
674 Book\n\nEdited Book 2006 ACA Annual Convention. 2006. US. The arti...
779 Book\n\nEdited Book The annual meeting of the Midwestern Conferenc...
393 Book\n\nEdited Book Midwestern Psychological Association conventio...
670 Book\n\nEdited Book International Conference on Practical Aspects ...

151 rows × 2 columns


In [25]:
psychinfo_search['Language'].value_counts()


Out[25]:
English           27823
French               83
Spanish              78
Italian              42
German               41
Portuguese           31
Dutch                29
Chinese              22
Greek                10
Hebrew                7
Turkish               6
Serbo-Croatian        5
Russian               5
Slovak                4
Japanese              3
Hungarian             3
Czech                 2
Polish                2
Danish                2
Norwegian             2
Romanian              2
Afrikaans             1
NonEnglish            1
Swedish               1
Finnish               1
Arabic                1
Name: Language, dtype: int64

In [38]:
language = {'English': 1, 'French': 2, 'Spanish': 3, 'Italian': 4, 'German': 5, 'Portuguese': 6,
       'Dutch': 7, 'Chinese': 8, 'Greek': 9, 'Hebrew': 10, 'Turkish': 10, 'Russian': 10,
       'Serbo-Croatian': 10, 'Slovak': 10, 'Japanese': 10, 'Hungarian': 10, 'Czech': 10,
       'Danish': 10, 'Romanian': 10, 'Polish': 10, 'Norwegian': 10, 'Swedish': 10, 'Finnish': 10,
       'NonEnglish': 10, 'Arabic': 10, 'Afrikaans': 10}

In [39]:
psychinfo_search['language'] = psychinfo_search['Language'].map(language)

In [40]:
#psychinfo_search["PsycINFO Classification Code"].value_counts().to_csv("data/PsycInfo/processed/PsycINFO_Classification_Code.csv")

In [41]:
#psychinfo_search["Tests & Measures"].value_counts().to_csv("data/PsycInfo/processed/Tests_&_Measures.csv")

In [42]:
#psychinfo_search["Key Concepts"].value_counts().to_csv("data/PsycInfo/processed/Key_Concepts.csv")

In [43]:
#psychinfo_search["Location"].value_counts().to_csv("data/PsycInfo/processed/Location.csv")

In [44]:
#psychinfo_search["MeSH Subject Headings"].value_counts().to_csv("data/PsycInfo/processed/MeSH_Subject_Headings.csv")

In [45]:
#psychinfo_search["Journal Name"].value_counts().to_csv("data/PsycInfo/processed/Journal_Name.csv")

In [46]:
#psychinfo_search["Institution"].value_counts().to_csv("data/PsycInfo/processed/Institution.csv")

In [118]:
len(psychinfo_search["Population Group"].value_counts())


Out[118]:
349

In [117]:
#psychinfo_search["Methodology"].value_counts()

In [48]:
def GetCats(text):
  pattern = re.compile("([0-9]+)")
  results = [100*(int(x)//100) for x in pattern.findall(text)]
  if len(set(results))>1:
    return 4300 
  else:
    return results[0]

In [49]:
psychinfo_search["PsycINFO_Classification_Code"] = psychinfo_search["PsycINFO Classification Code"].map(GetCats, "ignore")

In [52]:
lists = psychinfo["PsycINFO Classification Code"].map(GetCats, "ignore")
len(set([x for x in lists.dropna()]))
#Number of unique categories


Out[52]:
23

In [64]:
psychinfo_search["grants_sponsorship"] = psychinfo_search["Grant/Sponsorship"].fillna("").map(lambda x: int(len(x) > 0))

In [41]:
#psychinfo_search.to_csv("data/PsycInfo/processed/psychinfo_term_search.csv.bz2", encoding='utf-8', compression='bz2')

In [42]:
#psychinfo_search = psychinfo_search.drop('Title', 1)

In [126]:
#psychinfo_search["Methodology"].value_counts().to_csv("data/PsycInfo/Manual_Mapping/Methodology.csv")

In [127]:
#psychinfo_search["Population Group"].value_counts().to_csv("data/PsycInfo/Manual_Mapping/Population_Group.csv")

PsycINFO Tasks

Keep the current spreadsheet and add the following:

  1. Add Term in Abstract to spreadsheet (word co-occurrence and control for the length of the abstract--lambda(len(abstract)) )do this for NSF/NIH data as well
  2. Add Term in Title to spreadsheet
  3. Copy the word data into a new column (title it 'terms')--> code them as the following: 1 = multiculturalism, 2 = polyculturalism, 3 = cultural pluralism, 4 = monocultural, 5 = monoracial, 6 = bicultural, 7 = biracial, 8 = biethnic, 9 = interracial, 10 = multicultural, 11 = multiracial, 12 = polycultural, 13 = polyracial, 14 = polyethnic, 15 = mixed race, 16 = mixed ethnicity, 17 = other race, 18 = other ethnicity
  4. Search all options in set for the following categories: -- I will manually categorize them once you give all options in each set
    1. "Type of Book"
    2. "PsycINFO Classification Code" 1. (used the classification codes[recoded to most basic category levels] -- subcategories created by PsycInfo (22)-- multiple categories = 4300)
    3. "Document Type"
    4. "Grant/Scholarship"
      1. (create a dichotomized variable 0/1)
    5. "Tests & Measures"--> csv (no longer necessary)
      1. (Too many categories---needs to be reviewed manually/carefully in excel)
    6. "Publication Type"
    7. "Publication Status"
    8. "Population Group"
      1. (Need to be mapped manually and then recategorized)
      2. We need: gender, age (abstract, years)
    9. "Methodology"
      1. (can make specific methods dichotomous--may remove if unnecessary)
    10. "Conference"
      1. Right now, this is text (~699 entries)--> dichotomize variable. If it is a conference ie there is a text = 1, if there is NaN = 0.
      2. Then, I will incorporate this as a new category in "Publication Type" and remove this column).??? [not currently included as a category--overlaps with category 3 in Publication Type = Books]
    11. "Key Concepts"--> csv
      1. (word co-occurrence)
    12. "Location"-->csv--> sent to Barbara
      1. (categorized by region--multiple regions)
    13. "Language" 1. I am not sure about my "other" language (10) category -- I put everything with less than 10 entries into one category.
    14. "MeSH Subject Headings"--> csv (may no longer be necessary?)
      1. (word co-occurrence)
    15. "Journal Name"-->csv--> sent to Jian Xin
      1. (categorized by H-index in 2014)
    16. "Institution"-->csv --> sent to Barbara
      1. (categorized by state, region & country)
  5. Count the number of cited references for each entry

***Once we extract the csv files for these columns, I will categorize them.

Once all of these corrections have been made, make a new spreadsheet and delete the following information:

  1. Volume
  2. Publisher
  3. Accession Number
  4. Author(s)
  5. Issue
  6. Cited References
  7. Publication Status (had no variance)--only first posting
  8. Document Type???

In [121]:
len(psychinfo_search["Population Group"].value_counts())


Out[121]:
349