Analyzing Syllabi


In [43]:
import matplotlib
import pandas as pd
import seaborn
%matplotlib inline

In [336]:
# Load the table that has one row per (discipline, course) observation,
# then preview the first rows to check the columns.
df = pd.read_csv(filepath_or_buffer="disciplines-per-course.csv")
df.head(5)


Out[336]:
discipline course school program aggregates
0 cs berkeley-info202 Berkeley MIMS Computer Science
1 news berkeley-info202 Berkeley MIMS News
2 psych berkeley-info202 Berkeley MIMS Social Sciences
3 med berkeley-info202 Berkeley MIMS Science
4 cs berkeley-info202 Berkeley MIMS Computer Science

In [337]:
# Count the rows that fall under each (school, program, aggregate discipline)
# combination; the result is a Series with a three-level index.
breakdown_groups = df.groupby(["school", "program", "aggregates"])
disciplinary_breakdown_by_course = breakdown_groups.size()
disciplinary_breakdown_by_course.head()


Out[337]:
school    program  aggregates         
Berkeley  MIMS     Computer Science       6
                   HCI                    3
                   Humanities             4
                   Information Science    1
                   Law                    3
dtype: int64

In [338]:
# Pivot the aggregate disciplines (index level 2) into columns so that each
# (school, program) row becomes one horizontal bar stacked by raw count.
counts_by_discipline = disciplinary_breakdown_by_course.unstack(level=2)
ax = counts_by_discipline.plot(
    kind="barh",
    stacked=True,
    figsize=(11, 8),
    colormap="Set2",
)

# Tighten the layout before writing the figure to disk.
fig = ax.get_figure()
fig.tight_layout()
fig.savefig("raw_discipline_breakdown.pdf")


Proportions per course


In [339]:
# Number of rows per (course, aggregate discipline) pair.
course_discipline_groups = df.groupby(["course", "aggregates"])
raw_course_discipline_counts = course_discipline_groups.size()
raw_course_discipline_counts.head()


Out[339]:
course            aggregates         
berkeley-info202  Computer Science       3
                  Information Science    1
                  Management             1
                  News                   6
                  Science                2
dtype: int64

In [340]:
# Express each discipline count as a percentage of its course's total.
# `Series.sum(level=...)` was deprecated and then removed in pandas 2.0;
# grouping on the index level and summing is the supported equivalent.
course_totals = raw_course_discipline_counts.groupby(level="course").sum()
discipline_proportion_per_course = (
    raw_course_discipline_counts.div(course_totals, level="course") * 100
)

In [341]:
# Pivot disciplines (index level 1) into columns, treat absent disciplines as
# 0 percent, and write one row per course to CSV.
course_proportion_table = discipline_proportion_per_course.unstack(level=1).fillna(0)
course_proportion_table.to_csv("discipline_proportion_per_course.csv")

Proportions per program


In [342]:
# Raw counts per (school, program, aggregate discipline); this is the basis
# for the per-program proportions computed in the following cells.
program_discipline_groups = df.groupby(["school", "program", "aggregates"])
raw_programs_discipline_counts = program_discipline_groups.size()
raw_programs_discipline_counts


Out[342]:
school         program  aggregates         
Berkeley       MIMS     Computer Science        6
                        HCI                     3
                        Humanities              4
                        Information Science     1
                        Law                     3
                        Management              2
                        Misc                    2
                        News                   11
                        Science                 3
                        Social Sciences         6
Drexel         MSLIS    Information Science     2
                        LIS                    12
                        Management              3
Florida State  MSLIS    Information Science     2
                        LIS                     7
                        Law                     1
Georgia Tech   MSHCI    Information Science     2
                        News                    1
Indiana        MIS      Computer Science        1
                        HCI                    11
                        Information Science    43
                        LIS                     4
                        Management             17
                        Misc                    2
                        Science                 3
                        Social Sciences         1
               MLS      Computer Science        6
                        HCI                     3
                        Information Science    18
                        LIS                    23
                                               ..
Syracuse       MSLIS    Science                 1
                        Social Sciences         1
Texas Austin   MSIS     Archives                4
                        Computer Science        1
                        HCI                     4
                        Humanities             14
                        Information Science    18
                        LIS                     8
                        Management              6
                        Misc                    2
                        News                    2
                        Science                 4
                        Social Sciences        14
UCLA           MLIS     Archives                1
                        Computer Science        9
                        HCI                     4
                        Humanities              8
                        Information Science     8
                        LIS                    19
                        Law                     2
                        Misc                    4
                        News                    7
                        Science                 7
                        Social Sciences         6
UIUC           MSLIS    Computer Science        1
                        Humanities              2
                        Information Science    14
                        LIS                    42
                        Misc                    1
                        News                    3
dtype: int64

In [343]:
# Collapse the discipline level: total number of counted rows per
# (school, program).
per_program_groups = raw_programs_discipline_counts.groupby(level=["school", "program"])
total_program_counts = per_program_groups.sum()
total_program_counts


Out[343]:
school          program
Berkeley        MIMS        41
Drexel          MSLIS       17
Florida State   MSLIS       10
Georgia Tech    MSHCI        3
Indiana         MIS         82
                MLS         65
Irvine          MSICS       33
Maryland        MIM         19
                MLS         77
                MSHCI       19
Michigan        MSI         12
North Carolina  MSIS        56
                MSLS       161
Rutgers         MI          95
Syracuse        MSLIS       17
Texas Austin    MSIS        77
UCLA            MLIS        75
UIUC            MSLIS       63
dtype: int64

In [344]:
# One row per (school, program), one column per aggregate discipline; divide
# each row by that program's total to get percentages. Disciplines a program
# never cites stay NaN here.
program_discipline_table = raw_programs_discipline_counts.unstack()
proportion_per_program = program_discipline_table.div(total_program_counts, axis=0) * 100

In [ ]:


In [347]:
# Stacked horizontal bars: one bar per program, segmented by discipline share.
ax = proportion_per_program.plot(
    kind="barh",
    stacked=True,
    figsize=(11, 8),
    colormap="Set3",
)

# Values are percentages, so pin the x axis to 0-100 and move the legend
# outside the plotting area.
ax.set_xlim(0, 100)
ax.legend(bbox_to_anchor=(1.31, .85))

# Generous padding leaves room for the legend placed outside the axes.
fig = ax.get_figure()
fig.tight_layout(pad=12)
fig.savefig("proportion_per_program.pdf")



In [355]:
# Display the full program-by-discipline percentage table (NaN = discipline absent).
proportion_per_program


Out[355]:
aggregates Archives Computer Science HCI Humanities Information Science LIS Law Management Misc News Science Social Sciences
school program
Berkeley MIMS NaN 14.634146 7.317073 9.756098 2.439024 NaN 7.317073 4.878049 4.878049 26.829268 7.317073 14.634146
Drexel MSLIS NaN NaN NaN NaN 11.764706 70.588235 NaN 17.647059 NaN NaN NaN NaN
Florida State MSLIS NaN NaN NaN NaN 20.000000 70.000000 10.000000 NaN NaN NaN NaN NaN
Georgia Tech MSHCI NaN NaN NaN NaN 66.666667 NaN NaN NaN NaN 33.333333 NaN NaN
Indiana MIS NaN 1.219512 13.414634 NaN 52.439024 4.878049 NaN 20.731707 2.439024 NaN 3.658537 1.219512
MLS NaN 9.230769 4.615385 NaN 27.692308 35.384615 NaN 7.692308 1.538462 6.153846 1.538462 6.153846
Irvine MSICS NaN 3.030303 48.484848 15.151515 NaN NaN 6.060606 6.060606 3.030303 9.090909 3.030303 6.060606
Maryland MIM NaN NaN 10.526316 5.263158 15.789474 NaN NaN 52.631579 5.263158 NaN 5.263158 5.263158
MLS 1.298701 1.298701 2.597403 NaN 15.584416 68.831169 NaN 3.896104 NaN 2.597403 3.896104 NaN
MSHCI NaN NaN 78.947368 NaN NaN 15.789474 NaN NaN NaN 5.263158 NaN NaN
Michigan MSI NaN NaN NaN NaN 33.333333 16.666667 NaN NaN 16.666667 8.333333 8.333333 16.666667
North Carolina MSIS 1.785714 NaN 7.142857 1.785714 12.500000 21.428571 NaN 3.571429 10.714286 12.500000 8.928571 19.642857
MSLS 1.242236 1.863354 1.242236 1.863354 36.024845 36.024845 NaN 1.863354 4.347826 3.105590 4.347826 8.074534
Rutgers MI NaN 1.052632 NaN 2.105263 33.684211 33.684211 NaN 11.578947 5.263158 5.263158 1.052632 6.315789
Syracuse MSLIS NaN NaN NaN NaN 5.882353 47.058824 5.882353 23.529412 NaN 5.882353 5.882353 5.882353
Texas Austin MSIS 5.194805 1.298701 5.194805 18.181818 23.376623 10.389610 NaN 7.792208 2.597403 2.597403 5.194805 18.181818
UCLA MLIS 1.333333 12.000000 5.333333 10.666667 10.666667 25.333333 2.666667 NaN 5.333333 9.333333 9.333333 8.000000
UIUC MSLIS NaN 1.587302 NaN 3.174603 22.222222 66.666667 NaN NaN 1.587302 4.761905 NaN NaN

In [356]:
# Export the percentages with missing disciplines written as 0.
filled_program_proportions = proportion_per_program.fillna(0)
filled_program_proportions.to_csv("proportion_per_program.csv")

Entropy as a measure of complexity


In [357]:
from scipy.stats import entropy

In [368]:
# Entropy of each program's discipline distribution, used as a diversity score.
# A later cell stores this score back on `proportion_per_program` as a
# "diversity" column; exclude it here so that re-running this cell never
# feeds the score itself into the entropy calculation.
discipline_shares = proportion_per_program.drop(columns="diversity", errors="ignore")
core_complexity = discipline_shares.fillna(0).apply(entropy, axis=1)

In [379]:
# Plot programs from least to most diverse.
# `Series.sort` has been removed from pandas; `sort_values` returns a new,
# sorted Series and leaves `core_complexity` untouched.
ax = core_complexity.sort_values(ascending=True).plot(kind="barh",
                                                      figsize=(11, 8))
fig = ax.get_figure()
fig.tight_layout()
fig.savefig("complexity_per_program.pdf")



In [380]:
# Display the per-program entropy scores.
core_complexity


Out[380]:
school          program
Maryland        MSHCI      0.633040
Georgia Tech    MSHCI      0.636514
Florida State   MSLIS      0.801819
Drexel          MSLIS      0.803742
UIUC            MSLIS      0.990578
Maryland        MLS        1.102135
Indiana         MIS        1.400617
Maryland        MIM        1.486124
Syracuse        MSLIS      1.528466
Rutgers         MI         1.644243
North Carolina  MSLS       1.650934
Michigan        MSI        1.676235
Irvine          MSICS      1.682467
Indiana         MLS        1.753972
North Carolina  MSIS       2.075929
Texas Austin    MSIS       2.100760
Berkeley        MIMS       2.101786
UCLA            MLIS       2.191350
dtype: float64

In [381]:
# Attach the entropy score as a "diversity" column (aligned on the
# school/program index) so the table can be ordered by it.
proportion_per_program = proportion_per_program.assign(diversity=core_complexity)

In [384]:
# Order programs by ascending diversity score.
# `DataFrame.sort(columns=...)` has been removed from pandas; `sort_values(by=...)`
# is the replacement and returns a new frame.
proportion_per_program_ordered = proportion_per_program.sort_values(by="diversity")

In [388]:
# The score was only needed for ordering; remove it so it is not plotted as a
# discipline. A non-inplace drop with errors="ignore" keeps this cell safe to
# re-run (the in-place version raises KeyError the second time).
proportion_per_program_ordered = proportion_per_program_ordered.drop(
    columns="diversity", errors="ignore"
)

In [523]:
# Same stacked-percentage chart as before, with programs ordered by diversity.
ax = proportion_per_program_ordered.plot(kind="barh",
                                         stacked=True,
                                         figsize=(11, 8),
                                         colormap="Set3",
                                         )

# Percent scale, legend moved outside the plotting area.
ax.set_xlim(0, 100)
ax.legend(bbox_to_anchor=(1.31, .85))

fig = ax.get_figure()
# Title typo fixed ("porportions" -> "proportions").
ax.set_title("Disciplinary proportions per Program")
ax.set_xlabel('Percentage')
ax.set_ylabel('School-Program')

# Generous padding leaves room for the legend placed outside the axes.
# Note: this writes to the same file name as the unordered chart saved earlier.
fig.tight_layout(pad=12)
fig.savefig("proportion_per_program.pdf")


Topic areas across the iSchools


In [400]:
# Share of each aggregate discipline across all programs combined, in percent.
# The divisor is a single grand total (a scalar), so no index alignment is
# involved in the division.
counts_per_discipline = raw_programs_discipline_counts.groupby(level="aggregates").sum()
grand_total = raw_programs_discipline_counts.sum()
ischool_discipline_proportions = counts_per_discipline / grand_total * 100

In [503]:
# Show the disciplines from smallest to largest share.
# `Series.sort` has been removed from pandas; `sort_values` returns a sorted copy.
ischool_discipline_proportions.sort_values()


Out[503]:
aggregates
Archives                0.976139
Law                     0.976139
Computer Science        3.253796
Misc                    3.687636
Science                 4.121475
Humanities              4.338395
News                    5.748373
HCI                     7.158351
Social Sciences         7.266811
Management              7.375271
Information Science    24.403471
LIS                    30.694143
dtype: float64

In [506]:
# Pie chart of discipline shares across all programs.
# `stacked` has no meaning for a pie chart, so it is no longer passed.
# label='' suppresses the default y-axis label on the pie.
ax = ischool_discipline_proportions.plot(kind="pie",
                                         figsize=(20, 20),
                                         colormap="Set3",
                                         label='',
                                         fontsize=32,
                                         autopct='%.1f')

# Set the title before tight_layout so the layout pass accounts for it.
ax.set_title("Interdisciplinarity Across Syllabi", fontsize=36)

fig = ax.get_figure()
fig.tight_layout(pad=12)
fig.savefig("proportion_all_ischools.png")


Graphing overlap


In [428]:
# Load the citation table; later cells read its 'type', 'title' and 'journal' columns.
cleaned_citations = pd.read_csv("cleaned_cites.csv")

In [464]:
# Keep only citations typed as articles, then count how often each title
# occurs, most frequent first.
# `Series.sort` has been removed from pandas; `sort_values` is the replacement.
journal_articles = cleaned_citations[cleaned_citations['type'] == 'article']
top_journal_articles = journal_articles.groupby('title').size().sort_values(ascending=False)

In [519]:
# Rank-frequency curve for article titles: x is the rank position
# (use_index=False plots against 0..n-1 rather than the titles), y is the count.
ax = top_journal_articles.plot(
    kind="line",
    figsize=(11, 4),
    use_index=False,
)

# Restrict the view to the first 100 ranks and counts up to 7, then label.
ax.set(
    xlim=(0, 100),
    ylim=(0, 7),
    title="Distribution of Popular Articles across Syllabi",
    xlabel='Article Popularity Ranking',
    ylabel='Number of Syllabi',
)

fig = ax.get_figure()
fig.tight_layout()
fig.savefig("popular-articles.pdf")



In [473]:
# Show the 15 most frequently cited article titles.
top_journal_articles.head(15)


Out[473]:
title
As We May Think                                                                                        6
Information as Thing                                                                                   6
The invisible substrate of information science                                                         4
The design of browsing and berry-picking techniques for online search interface                        4
Metadata for all: Descriptive standards and metadata sharing across libraries, archives and museums    4
Question negotiation and information seeking in libraries                                              3
Inside the search process: Information seeking from the user's perspective                             3
What is a “Document”?                                                                                  3
The Reference Interview: Theories and Practice                                                         2
On the Web at home: Information seeking and Web searching in the home environment                      2
The concept of situation in information science                                                        2
The Online Library Catalog: Paradise Lost and Paradise Regained?                                       2
What is a collection?                                                                                  2
Social Q&A and virtual reference ­­ Comparing apples and oranges with the help of experts and users    2
Oranges and peaches: Understanding communication accidents in the reference interview                  2
dtype: int64

Top journal titles


In [524]:
# Count article citations per journal, most cited first, and preview the top 30.
# `Series.sort` has been removed from pandas; `sort_values` is the replacement.
article_citations = cleaned_citations[cleaned_citations['type'] == 'article']
top_journal_titles = article_citations.groupby('journal').size().sort_values(ascending=False)
top_journal_titles.head(30)


Out[524]:
journal
Journal of the American Society for Information Science & Technology    57
Journal of the American Society for Information Science                 38
Journal of Documentation                                                28
Information Processing & Management                                     25
Library Trends                                                          21
College & Research Libraries                                            19
Harvard Business Review                                                 19
Interactions                                                            18
Journal of Information Science                                          17
Journal of Academic Librarianship                                       16
Cataloging & Classification Quarterly                                   14
Library & Information Science Research                                  14
Journal of Library Administration                                       14
Communications of the ACM                                               11
Scientific American                                                     10
Information Research                                                     9
Information Technology & Libraries                                       9
Reference & User Services Quarterly                                      8
Library Hi Tech                                                          8
Library Administration & Management                                      8
New York Times                                                           7
The Atlantic Monthly                                                     7
D-Lib Magazine                                                           7
The Reference Librarian                                                  7
Library Resources & Technical Services                                   7
First Monday                                                             6
Searcher                                                                 6
American Psychologist                                                    6
The Information Society                                                  6
Library Journal                                                          5
dtype: int64

In [484]:
# Summary statistics of the per-journal citation counts.
top_journal_titles.describe()


Out[484]:
count    324.000000
mean       2.722222
std        5.064598
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max       57.000000
dtype: float64

In [517]:
# Rank-frequency curve for journals: x is the rank position
# (use_index=False plots against 0..n-1 rather than the names), y is the count.
# Axis limits are left at matplotlib's defaults for this chart.
ax = top_journal_titles.plot(
    kind="line",
    figsize=(11, 4),
    use_index=False,
)

ax.set(
    title="Popularity Distribution of Journal Titles",
    xlabel='Journal Popularity Ranking',
    ylabel='Number of Citations',
)

fig = ax.get_figure()
fig.tight_layout()
fig.savefig("popular-journals.pdf")



In [ ]: