In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# The file name has no ".gz" suffix, so the gzip compression is stated
# explicitly rather than left to be inferred from the extension.
SOURCE_CSV = "83f8a712c4cf05ce7492cfade8170169cca301a5"
df = pd.read_csv(SOURCE_CSV, compression="gzip")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)


Out[3]:
Collection Name Provider Name Restricted for Search Restricted for Delivery Delivery Method Contains Newspapers Open Access Resource Type Fulltext Searchable Language of Materials covered Subject Headings Keywords Index Coverage Record Count
0 oaFindr 1science Yes Yes Link in Record No No Articles, Reviews No Weekly oaFindr indexes Open Access peer reviewed scho... 100% 28138247
1 African American Experience (Academic) ABC-CLIO No Yes Link in Record No No Text Resources, Articles, Images, Audio, Video No Weekly The African American Experience: The American ... 100% 9017
2 American Government (Academic) ABC-CLIO No Yes Link in Record No No Articles, Images, Text Resources, Video, Maps No Weekly American Government hosts a selection of refer... 100% 17136
3 American History (Academic) ABC-CLIO No Yes Link in Record No No Articles, Text Resources, Images, Video, Maps No Weekly American History provides a survey of American... 100% 23293
4 American Indian Experience (Academic) ABC-CLIO No Yes Link in Record No No Articles, Text Resources, Images, Video, Maps No Weekly The American Indian Experience: The American M... 100% 4534

In [4]:
# Row count of the loaded frame.
df.shape[0]


Out[4]:
4328

In [5]:
# Normalise the "Fulltext Searchable" column: trim surrounding whitespace,
# then fold the variant spellings "0" and "no" into the canonical "No".
flag_col = "Fulltext Searchable"
df[flag_col] = df[flag_col].str.strip()
df.loc[df[flag_col].isin(["0", "no"]), flag_col] = "No"

In [6]:
# Number of rows for each distinct "Fulltext Searchable" value.
by_searchable = df.groupby(by="Fulltext Searchable")
by_searchable.size()


Out[6]:
Fulltext Searchable
No         2515
Partial     201
Unknown       1
Yes        1611
dtype: int64

In [7]:
# Pie chart of how many rows fall into each "Fulltext Searchable" value.
searchable_counts = df.groupby("Fulltext Searchable").size()
ax = searchable_counts.plot(
    kind="pie",
    # N is computed from the plotted counts so the title cannot drift out of
    # sync with the data if the input file or the cleaning step changes.
    title=f"Fulltext Searchable (Collection, N={searchable_counts.sum()})",
    figsize=(10, 6),
)
# Blank out the y-axis label; an empty string clears it without relying on
# how the installed matplotlib version treats None.
ax.set_ylabel("");



In [8]:
# Twenty rows with the highest "Record Count", showing name and count only.
by_size_desc = df.sort_values(by="Record Count", ascending=False)
by_size_desc[["Collection Name", "Record Count"]].head(20)


Out[8]:
Collection Name Record Count
3100 ProQuest Central 261326717
3105 ProQuest Central Korea 249665389
3104 ProQuest Central Essentials 185653721
2794 Business Premium Collection 162678593
2725 ABI/INFORM Collection 154165112
2795 Business Premium Collection (Alumni edition) 142155895
1761 General OneFile** 140772336
2726 ABI/INFORM Collection (Alumni edition) 135380135
3106 ProQuest Central Student 122334753
1740 Espacenet 117328005
3103 ProQuest Central China 108860423
3429 SciTech Premium Collection 105284623
1752 Business Insights: Essentials 95706570
1749 Academic OneFile** 92385299
2733 ABI/INFORM Trade & Industry 89185788
2734 ABI/INFORM Trade & Industry (Alumni edition) 88065800
2727 ABI/INFORM Collection China 86081392
1628 Scopus 76721685
1425 *** ALL CrossRef Collections *** 73361943
3421 Research Library Prep 62582351

In [9]:
# Trim surrounding whitespace on the flag column. The same strip is already
# applied in an earlier cell, so on a top-to-bottom run this changes nothing.
flag_col = "Fulltext Searchable"
df[flag_col] = df[flag_col].str.strip()

In [10]:
# Re-check the per-value row counts of "Fulltext Searchable".
flag_groups = df.groupby(by="Fulltext Searchable")
flag_groups.size()


Out[10]:
Fulltext Searchable
No         2515
Partial     201
Unknown       1
Yes        1611
dtype: int64

In [11]:
# Thirty rows with the highest "Record Count" among those flagged "Yes".
shown_cols = ["Collection Name", "Provider Name", "Fulltext Searchable", "Record Count"]
is_searchable = df["Fulltext Searchable"] == "Yes"
searchable_desc = df.loc[is_searchable].sort_values(by="Record Count", ascending=False)
searchable_desc[shown_cols].head(30)


Out[11]:
Collection Name Provider Name Fulltext Searchable Record Count
3100 ProQuest Central ProQuest Yes 261326717
3105 ProQuest Central Korea ProQuest Yes 249665389
3104 ProQuest Central Essentials ProQuest Yes 185653721
2794 Business Premium Collection ProQuest Yes 162678593
2725 ABI/INFORM Collection ProQuest Yes 154165112
1761 General OneFile** Gale Yes 140772336
3106 ProQuest Central Student ProQuest Yes 122334753
1752 Business Insights: Essentials Gale Yes 95706570
1749 Academic OneFile** Gale Yes 92385299
2733 ABI/INFORM Trade & Industry ProQuest Yes 89185788
3387 ProQuest Research Library ProQuest Yes 60352760
2965 Health Research Premium Collection ProQuest Yes 57455524
2728 ABI/INFORM Dateline ProQuest Yes 54168637
2959 Global Newsstream (Alumni) ProQuest Yes 44487303
3281 ProQuest Historical Newspapers: U.S. Major Dai... ProQuest Yes 43341112
3341 ProQuest Pharma Collection ProQuest Yes 43167625
2778 Biological Science Collection ProQuest Yes 43035717
3099 ProQuest Career and Technical Education ProQuest Yes 26289436
3272 ProQuest Historical Newspapers: The New York T... ProQuest Yes 26098963
3180 ProQuest Engineering Collection ProQuest Yes 25856867
2738 Advanced Technologies & Aerospace Collection ProQuest Yes 25310664
2730 ABI/INFORM Global ProQuest Yes 22217065
3224 ProQuest Historical Newspapers: International ... ProQuest Yes 20263286
2964 Health & Medical Collection ProQuest Yes 19491441
3336 ProQuest Military Collection ProQuest Yes 18758209
2745 Agricultural & Environmental Science Collection ProQuest Yes 17174449
3277 ProQuest Historical Newspapers: The Washington... ProQuest Yes 17026127
3477 U.S. Newsstream (Alumni) ProQuest Yes 16309427
3182 ProQuest Environmental Science Collection ProQuest Yes 16294236
1762 Health Reference Center Academic Gale Yes 16263422

In [12]:
# Thirty rows with the highest "Record Count" among those flagged "No".
shown_cols = ["Collection Name", "Provider Name", "Fulltext Searchable", "Record Count"]
not_searchable = df["Fulltext Searchable"] == "No"
not_searchable_desc = df.loc[not_searchable].sort_values(by="Record Count", ascending=False)
not_searchable_desc[shown_cols].head(30)


Out[12]:
Collection Name Provider Name Fulltext Searchable Record Count
2795 Business Premium Collection (Alumni edition) ProQuest No 142155895
2726 ABI/INFORM Collection (Alumni edition) ProQuest No 135380135
1740 Espacenet European Patent Office No 117328005
3103 ProQuest Central China ProQuest No 108860423
2734 ABI/INFORM Trade & Industry (Alumni edition) ProQuest No 88065800
2727 ABI/INFORM Collection China ProQuest No 86081392
1628 Scopus Elsevier No 76721685
1425 *** ALL CrossRef Collections *** CrossRef No 73361943
3421 Research Library Prep ProQuest No 62582351
1421 The Chinese Science and Technology Periodical ... CQVIP (Chongqing VIP Information Co., Ltd.) No 60500487
3419 Research Library (Alumni edition) ProQuest No 60069938
2966 Health Research Premium Collection (Alumni edi... ProQuest No 55274140
3086 Periodicals Index Online ProQuest No 40752143
4301 China Online Journals (COJ) Wanfang Data Co. Ltd. No 33624400
4260 MEDLINE/PubMed U.S. National Library of Medicine (NLM) No 29972246
3029 MEDLINE ProQuest No 29737108
0 oaFindr 1science No 28138247
4232 Europeana The Europeana Foundation No 25536145
1484 Rest of CrossRef collections not specified CrossRef No 25405883
2804 Career & Technical Education Database (Alumni ... ProQuest No 25395059
2981 Hospital Premium Collection ProQuest No 25117492
3452 Technology Research Database ProQuest No 25106821
2982 Hospital Premium Collection (Alumni edition) ProQuest No 22773916
2604 magazineplus 概要 Nichigai Associates No 21242630
2129 *** ALL JSTOR Collections *** JSTOR No 19710321
2731 ABI/INFORM Global (Alumni edition) ProQuest No 18804416
3033 Military Database (Alumni edition) ProQuest No 18651918
3420 Research Library China ProQuest No 16248791
1458 Elsevier CrossRef No 15151457
2737 Accounting, Tax & Banking Collection (Alumni e... ProQuest No 14322416

In [ ]: