English Wikipedia Heading Frequency

This notebook ranks English Wikipedia section headings by the number of articles in which each heading appears, in support of this research project.


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Open the headings TSV as a chunked reader (100000 rows per chunk) instead of
# parsing the whole file in one call, to limit memory use while reading.
# Approach taken from:
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
# NOTE(review): with pandas' default NA handling, text cells such as "NA" or
# "null" are parsed as missing values — confirm that is acceptable for the
# page_title / heading_text columns.
tp = pd.read_csv(
    'enwiki_20161101_headings_2.tsv',
    sep='\t',
    header=0,
    dtype={
        'page_id': np.int32,
        'page_title': object,
        'page_ns': np.int16,
        'heading_level': np.int8,
        'heading_text': object,
    },
    iterator=True,
    chunksize=100000,
)

In [3]:
# Drain the chunked reader and stack every chunk into one DataFrame.
# `tp` is a single-pass iterator, so the reader cell has to be re-run before
# this cell can be executed again.
en_DF = pd.concat(tp)

In [6]:
# preview the first five rows of the combined frame
en_DF.head(n=5)


Out[6]:
page_id page_title page_ns heading_level heading_text
0 3046527 Bernard Fisher 0 2 People
1 3046527 Bernard Fisher 0 2 Other
2 3046529 Gunpowder Incident 0 2 Background
3 3046529 Gunpowder Incident 0 2 Removing the gunpowder
4 3046529 Gunpowder Incident 0 2 Aftermath

In [7]:
# list the distinct namespace ids present in the data
en_DF['page_ns'].unique()


Out[7]:
array([0])

In [8]:
# count the distinct article titles in the data
# (dropna=False: a missing title, if any, counts as one value, as with len(unique()))
en_DF['page_title'].nunique(dropna=False)


Out[8]:
4947966

In [9]:
# Remove leading and trailing whitespace from the heading_text column so that
# headings differing only by surrounding whitespace are grouped together.
# Uses the public `.str` accessor rather than `pd.core.strings.str_strip`,
# a private helper that is not part of the supported pandas API and is absent
# from current pandas releases.
en_DF['heading_text'] = en_DF['heading_text'].str.strip()

In [10]:
# For each distinct heading_text, count how many different page_titles it
# appears in, then order the headings from most to least widespread.
# `nunique` is the built-in groupby reduction for this and avoids calling a
# Python lambda once per heading; `dropna=False` keeps the count identical to
# `len(x.unique())`, which treats a missing title as a value of its own.
# The result is a pandas Series indexed by heading_text.
article_count = (
    en_DF.groupby('heading_text')['page_title']
    .nunique(dropna=False)
    .sort_values(ascending=False)
)

In [11]:
# Build a two-column DataFrame from the Series: the headings (its index)
# become section_title and the counts (its values) become number_of_articles.
en_article_count_DF = pd.DataFrame(
    {
        'section_title': article_count.index,
        'number_of_articles': article_count.values,
    }
)

In [12]:
# preview the five most common section titles
en_article_count_DF.head(n=5)


Out[12]:
number_of_articles section_title
0 4125018 References
1 2338348 External links
2 1134624 See also
3 533444 History
4 283206 Notes

In [13]:
# Add a column giving each heading's article count as a percentage of all articles.
# NOTE(review): the denominator 5275388 is hard-coded and differs from the
# distinct page_title count computed from en_DF earlier in this notebook —
# presumably the total number of English Wikipedia articles at dump time; confirm.
article_share = en_article_count_DF['number_of_articles'] / 5275388
en_article_count_DF['article_percentage'] = article_share * 100

In [14]:
# Allow pandas to display up to 100 rows.
pd.set_option('display.max_rows', 100)
# Show the 100 most common headings, with the percentage rounded to 2 decimals.
en_article_count_DF.head(100).round({'article_percentage': 2})


Out[14]:
number_of_articles section_title article_percentage
0 4125018 References 78.19
1 2338348 External links 44.33
2 1134624 See also 21.51
3 533444 History 10.11
4 283206 Notes 5.37
5 176458 Career 3.34
6 152442 Biography 2.89
7 148218 Further reading 2.81
8 145087 Track listing 2.75
9 122415 Bibliography 2.32
10 114564 Sources 2.17
11 110000 Early life 2.09
12 109315 Cast 2.07
13 106490 Geography 2.02
14 98892 Personal life 1.87
15 94728 Background 1.80
16 94726 Plot 1.80
17 86593 Reception 1.64
18 80904 Description 1.53
19 79214 Discography 1.50
20 77536 Demographics 1.47
21 76750 Awards 1.45
22 75455 Personnel 1.43
23 63232 Education 1.20
24 58490 Results 1.11
25 56609 Filmography 1.07
26 52229 Life 0.99
27 52182 Honours 0.99
28 48520 Production 0.92
29 47715 Works 0.90
30 44535 Gallery 0.84
31 38929 Overview 0.74
32 38527 Singles 0.73
33 36777 Footnotes 0.70
34 35906 Death 0.68
35 34101 Charts 0.65
36 32771 Distribution 0.62
37 30184 Population 0.57
38 29933 Publications 0.57
39 29678 Legacy 0.56
40 29258 Family 0.55
41 26886 Music 0.51
42 26723 Club career 0.51
43 26523 Early life and education 0.50
44 26453 Television 0.50
45 26328 Species 0.50
46 26249 Events 0.50
47 26221 Final 0.50
48 25783 Books 0.49
49 25289 Critical reception 0.48
50 25235 Albums 0.48
51 25108 Economy 0.48
52 24934 Early years 0.47
53 24572 Location 0.47
54 24463 International career 0.46
55 23888 Schedule 0.45
56 23759 Professional career 0.45
57 22722 Career statistics 0.43
58 22160 Notable people 0.42
59 21110 Etymology 0.40
60 21090 Notes and references 0.40
61 21023 Synopsis 0.40
62 20975 Playing career 0.40
63 20652 Chart performance 0.39
64 19553 Development 0.37
65 19511 Climate 0.37
66 19491 Political career 0.37
67 19444 Transportation 0.37
68 18832 Politics 0.36
69 18714 Other 0.35
70 18473 Release 0.35
71 18408 Film 0.35
72 18214 Life and career 0.35
73 18076 Aftermath 0.34
74 18073 People 0.34
75 17476 Deaths 0.33
76 17385 Soundtrack 0.33
77 17346 Early career 0.33
78 17265 Citations 0.33
79 17092 Transport 0.32
80 16918 Music video 0.32
81 16085 Awards and nominations 0.30
82 15993 Members 0.30
83 15938 Births 0.30
84 15913 Club 0.30
85 15616 Selected filmography 0.30
86 15615 Statistics 0.30
87 15419 In popular culture 0.29
88 15377 Episodes 0.29
89 15269 Literature 0.29
90 15239 Gameplay 0.29
91 15132 Plot summary 0.29
92 15115 Characters 0.29
93 15106 Regular season 0.29
94 14802 Seeds 0.28
95 14574 Places 0.28
96 14315 Notable alumni 0.27
97 14306 Sports 0.27
98 14132 Roster 0.27
99 13856 Release history 0.26