English Wikipedia Heading Frequency

This notebook ranks English Wikipedia section headings by the number of articles in which they appear, as part of this research project.



In [1]:

    
import numpy as np
import pandas as pd



In [2]:

    
# Open the headings TSV as a chunked reader (100000 rows per chunk) to
# conserve memory while parsing. Approach taken from:
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
tp = pd.read_csv(
    'enwiki_20161101_headings_2.tsv',
    sep='\t',
    header=0,
    dtype={
        'page_id': np.int32,
        'page_title': object,
        'page_ns': np.int16,
        'heading_level': np.int8,
        'heading_text': object,
    },
    iterator=True,
    chunksize=100000,
)



In [3]:

    
# Drain the chunk reader and stack every chunk into one pandas dataframe.
# Iterating the reader consumes it, so the reader has to be re-created
# before this cell can be run a second time.
en_DF = pd.concat(tp)



In [6]:

    
# preview the first five rows of the combined dataframe
en_DF.head(n=5)









    Out[6]:






  
    
      
      page_id
      page_title
      page_ns
      heading_level
      heading_text
    
  
  
    
      0
      3046527
      Bernard Fisher
      0
      2
      People
    
    
      1
      3046527
      Bernard Fisher
      0
      2
      Other
    
    
      2
      3046529
      Gunpowder Incident
      0
      2
      Background
    
    
      3
      3046529
      Gunpowder Incident
      0
      2
      Removing the gunpowder
    
    
      4
      3046529
      Gunpowder Incident
      0
      2
      Aftermath



In [7]:

    
# list the distinct namespace ids present in the page_ns column
en_DF['page_ns'].unique()









    Out[7]:





array([0])



In [8]:

    
# count how many distinct page titles (articles) occur in the data
en_DF['page_title'].unique().size









    Out[8]:





4947966



In [9]:

    
# Remove leading and trailing whitespace from the heading_text column.
# The public `.str` accessor is used here: `pd.core.strings.str_strip` is an
# internal pandas helper, not part of the supported API, and is absent from
# newer pandas releases.
en_DF['heading_text'] = en_DF['heading_text'].str.strip()



In [10]:

    
# For every heading_text, count the number of distinct page_titles it appears
# in, then sort the counts in descending order.
# This returns a pandas Series indexed by heading_text.
#
# `nunique` is the built-in equivalent of `apply(lambda x: len(x.unique()))`
# and avoids calling a Python lambda once per group. `dropna=False` keeps a
# missing page_title counted as one distinct value, matching what
# `len(x.unique())` produced.
article_count = (
    en_DF.groupby('heading_text')['page_title']
    .nunique(dropna=False)
    .sort_values(ascending=False)
)



In [11]:

    
# Turn the pandas Series into a two-column pandas dataframe.
# The column order is given explicitly so the layout (number_of_articles
# first, then section_title) does not depend on how the installed
# pandas/Python version orders dict keys.
en_article_count_DF = pd.DataFrame(
    {
        'section_title': article_count.index,
        'number_of_articles': article_count.values,
    },
    columns=['number_of_articles', 'section_title'],
)



In [12]:

    
# preview the five most common section titles
en_article_count_DF.head(n=5)









    Out[12]:






  
    
      
      number_of_articles
      section_title
    
  
  
    
      0
      4125018
      References
    
    
      1
      2338348
      External links
    
    
      2
      1134624
      See also
    
    
      3
      533444
      History
    
    
      4
      283206
      Notes



In [13]:

    
# Denominator used for the percentage column.
# NOTE(review): this figure is larger than the 4947966 distinct page titles
# counted from the headings file earlier in this notebook -- presumably it is
# the total number of English Wikipedia articles, including articles that have
# no section headings at all. TODO confirm the source of this number.
EN_ARTICLE_TOTAL = 5275388

# add a column for the percentage of articles that header appears in
en_article_count_DF['article_percentage'] = (
    en_article_count_DF['number_of_articles'] / EN_ARTICLE_TOTAL
) * 100



In [14]:

    
# allow pandas to render up to 100 rows so the whole ranking is shown
pd.set_option('display.max_rows', 100)

# top 100 results, with the percentage column rounded to 2 decimal places
en_article_count_DF.head(100).round({'article_percentage': 2})









    Out[14]:






  
    
      
      number_of_articles
      section_title
      article_percentage
    
  
  
    
      0
      4125018
      References
      78.19
    
    
      1
      2338348
      External links
      44.33
    
    
      2
      1134624
      See also
      21.51
    
    
      3
      533444
      History
      10.11
    
    
      4
      283206
      Notes
      5.37
    
    
      5
      176458
      Career
      3.34
    
    
      6
      152442
      Biography
      2.89
    
    
      7
      148218
      Further reading
      2.81
    
    
      8
      145087
      Track listing
      2.75
    
    
      9
      122415
      Bibliography
      2.32
    
    
      10
      114564
      Sources
      2.17
    
    
      11
      110000
      Early life
      2.09
    
    
      12
      109315
      Cast
      2.07
    
    
      13
      106490
      Geography
      2.02
    
    
      14
      98892
      Personal life
      1.87
    
    
      15
      94728
      Background
      1.80
    
    
      16
      94726
      Plot
      1.80
    
    
      17
      86593
      Reception
      1.64
    
    
      18
      80904
      Description
      1.53
    
    
      19
      79214
      Discography
      1.50
    
    
      20
      77536
      Demographics
      1.47
    
    
      21
      76750
      Awards
      1.45
    
    
      22
      75455
      Personnel
      1.43
    
    
      23
      63232
      Education
      1.20
    
    
      24
      58490
      Results
      1.11
    
    
      25
      56609
      Filmography
      1.07
    
    
      26
      52229
      Life
      0.99
    
    
      27
      52182
      Honours
      0.99
    
    
      28
      48520
      Production
      0.92
    
    
      29
      47715
      Works
      0.90
    
    
      30
      44535
      Gallery
      0.84
    
    
      31
      38929
      Overview
      0.74
    
    
      32
      38527
      Singles
      0.73
    
    
      33
      36777
      Footnotes
      0.70
    
    
      34
      35906
      Death
      0.68
    
    
      35
      34101
      Charts
      0.65
    
    
      36
      32771
      Distribution
      0.62
    
    
      37
      30184
      Population
      0.57
    
    
      38
      29933
      Publications
      0.57
    
    
      39
      29678
      Legacy
      0.56
    
    
      40
      29258
      Family
      0.55
    
    
      41
      26886
      Music
      0.51
    
    
      42
      26723
      Club career
      0.51
    
    
      43
      26523
      Early life and education
      0.50
    
    
      44
      26453
      Television
      0.50
    
    
      45
      26328
      Species
      0.50
    
    
      46
      26249
      Events
      0.50
    
    
      47
      26221
      Final
      0.50
    
    
      48
      25783
      Books
      0.49
    
    
      49
      25289
      Critical reception
      0.48
    
    
      50
      25235
      Albums
      0.48
    
    
      51
      25108
      Economy
      0.48
    
    
      52
      24934
      Early years
      0.47
    
    
      53
      24572
      Location
      0.47
    
    
      54
      24463
      International career
      0.46
    
    
      55
      23888
      Schedule
      0.45
    
    
      56
      23759
      Professional career
      0.45
    
    
      57
      22722
      Career statistics
      0.43
    
    
      58
      22160
      Notable people
      0.42
    
    
      59
      21110
      Etymology
      0.40
    
    
      60
      21090
      Notes and references
      0.40
    
    
      61
      21023
      Synopsis
      0.40
    
    
      62
      20975
      Playing career
      0.40
    
    
      63
      20652
      Chart performance
      0.39
    
    
      64
      19553
      Development
      0.37
    
    
      65
      19511
      Climate
      0.37
    
    
      66
      19491
      Political career
      0.37
    
    
      67
      19444
      Transportation
      0.37
    
    
      68
      18832
      Politics
      0.36
    
    
      69
      18714
      Other
      0.35
    
    
      70
      18473
      Release
      0.35
    
    
      71
      18408
      Film
      0.35
    
    
      72
      18214
      Life and career
      0.35
    
    
      73
      18076
      Aftermath
      0.34
    
    
      74
      18073
      People
      0.34
    
    
      75
      17476
      Deaths
      0.33
    
    
      76
      17385
      Soundtrack
      0.33
    
    
      77
      17346
      Early career
      0.33
    
    
      78
      17265
      Citations
      0.33
    
    
      79
      17092
      Transport
      0.32
    
    
      80
      16918
      Music video
      0.32
    
    
      81
      16085
      Awards and nominations
      0.30
    
    
      82
      15993
      Members
      0.30
    
    
      83
      15938
      Births
      0.30
    
    
      84
      15913
      Club
      0.30
    
    
      85
      15616
      Selected filmography
      0.30
    
    
      86
      15615
      Statistics
      0.30
    
    
      87
      15419
      In popular culture
      0.29
    
    
      88
      15377
      Episodes
      0.29
    
    
      89
      15269
      Literature
      0.29
    
    
      90
      15239
      Gameplay
      0.29
    
    
      91
      15132
      Plot summary
      0.29
    
    
      92
      15115
      Characters
      0.29
    
    
      93
      15106
      Regular season
      0.29
    
    
      94
      14802
      Seeds
      0.28
    
    
      95
      14574
      Places
      0.28
    
    
      96
      14315
      Notable alumni
      0.27
    
    
      97
      14306
      Sports
      0.27
    
    
      98
      14132
      Roster
      0.27
    
    
      99
      13856
      Release history
      0.26

	page_id	page_title	heading_level	heading_text
0	3046527	Bernard Fisher	2	People
1	3046527	Bernard Fisher	2	Other
2	3046529	Gunpowder Incident	2	Background
3	3046529	Gunpowder Incident	2	Removing the gunpowder
4	3046529	Gunpowder Incident	2	Aftermath

	number_of_articles	section_title
0	4125018	References
1	2338348	External links
2	1134624	See also
3	533444	History
4	283206	Notes

	number_of_articles	section_title	article_percentage
0	4125018	References	78.19
1	2338348	External links	44.33
2	1134624	See also	21.51
3	533444	History	10.11
4	283206	Notes	5.37
5	176458	Career	3.34
6	152442	Biography	2.89
7	148218	Further reading	2.81
8	145087	Track listing	2.75
9	122415	Bibliography	2.32
10	114564	Sources	2.17
11	110000	Early life	2.09
12	109315	Cast	2.07
13	106490	Geography	2.02
14	98892	Personal life	1.87
15	94728	Background	1.80
16	94726	Plot	1.80
17	86593	Reception	1.64
18	80904	Description	1.53
19	79214	Discography	1.50
20	77536	Demographics	1.47
21	76750	Awards	1.45
22	75455	Personnel	1.43
23	63232	Education	1.20
24	58490	Results	1.11
25	56609	Filmography	1.07
26	52229	Life	0.99
27	52182	Honours	0.99
28	48520	Production	0.92
29	47715	Works	0.90
30	44535	Gallery	0.84
31	38929	Overview	0.74
32	38527	Singles	0.73
33	36777	Footnotes	0.70
34	35906	Death	0.68
35	34101	Charts	0.65
36	32771	Distribution	0.62
37	30184	Population	0.57
38	29933	Publications	0.57
39	29678	Legacy	0.56
40	29258	Family	0.55
41	26886	Music	0.51
42	26723	Club career	0.51
43	26523	Early life and education	0.50
44	26453	Television	0.50
45	26328	Species	0.50
46	26249	Events	0.50
47	26221	Final	0.50
48	25783	Books	0.49
49	25289	Critical reception	0.48
50	25235	Albums	0.48
51	25108	Economy	0.48
52	24934	Early years	0.47
53	24572	Location	0.47
54	24463	International career	0.46
55	23888	Schedule	0.45
56	23759	Professional career	0.45
57	22722	Career statistics	0.43
58	22160	Notable people	0.42
59	21110	Etymology	0.40
60	21090	Notes and references	0.40
61	21023	Synopsis	0.40
62	20975	Playing career	0.40
63	20652	Chart performance	0.39
64	19553	Development	0.37
65	19511	Climate	0.37
66	19491	Political career	0.37
67	19444	Transportation	0.37
68	18832	Politics	0.36
69	18714	Other	0.35
70	18473	Release	0.35
71	18408	Film	0.35
72	18214	Life and career	0.35
73	18076	Aftermath	0.34
74	18073	People	0.34
75	17476	Deaths	0.33
76	17385	Soundtrack	0.33
77	17346	Early career	0.33
78	17265	Citations	0.33
79	17092	Transport	0.32
80	16918	Music video	0.32
81	16085	Awards and nominations	0.30
82	15993	Members	0.30
83	15938	Births	0.30
84	15913	Club	0.30
85	15616	Selected filmography	0.30
86	15615	Statistics	0.30
87	15419	In popular culture	0.29
88	15377	Episodes	0.29
89	15269	Literature	0.29
90	15239	Gameplay	0.29
91	15132	Plot summary	0.29
92	15115	Characters	0.29
93	15106	Regular season	0.29
94	14802	Seeds	0.28
95	14574	Places	0.28
96	14315	Notable alumni	0.27
97	14306	Sports	0.27
98	14132	Roster	0.27
99	13856	Release history	0.26