French Wikipedia Heading Frequency

This notebook serves to sort German Wikipedia section headers by frequency as related to this research project.



In [1]:

    
import numpy as np
import pandas as pd



In [2]:

    
# read in headers file by chunks of 100000 to conserve memory
# https://stackoverflow.com/questions/25962114/how-to-read-a-6-gb-csv-file-with-pandas
tp = pd.read_csv('frwiki_20161101_headings_2.tsv', sep='\t', header=0, dtype={'page_id': np.int32, 'page_title': object, 'page_ns': np.int16, 'heading_level': np.int8, 'heading_text': object}, iterator=True, chunksize=100000)



In [3]:

    
# concatenate all rows into a pandas dataframe
fr_DF = pd.concat([chunk for chunk in tp])



In [4]:

    
fr_DF.head()









    Out[4]:






  
    
      
      page_id
      page_title
      page_ns
      heading_level
      heading_text
    
  
  
    
      0
      412304
      Dragon Head (film)
      0
      2
      Synopsis
    
    
      1
      412304
      Dragon Head (film)
      0
      2
      Fiche technique
    
    
      2
      412304
      Dragon Head (film)
      0
      2
      Distribution
    
    
      3
      412304
      Dragon Head (film)
      0
      2
      Distinctions
    
    
      4
      412304
      Dragon Head (film)
      0
      2
      Voir aussi



In [5]:

    
fr_DF.page_ns.unique()









    Out[5]:





array([0])



In [6]:

    
# determine number of unique articles in this dataset
len(fr_DF['page_title'].unique())









    Out[6]:





1662471



In [7]:

    
# remove leading and trailing whitespace from heading_text column
fr_DF['heading_text'] = pd.core.strings.str_strip(fr_DF['heading_text'])



In [8]:

    
# groupby heading_text and count the number of unique page_titles each heading appears in
# sort in descending order
# this returns a pandas series object
article_count = fr_DF.groupby('heading_text')['page_title'].apply(lambda x: len(x.unique())).sort_values(ascending=False)



In [9]:

    
# turn pandas series object into pandas dataframe
fr_article_count_DF = pd.DataFrame({'section_title':article_count.index, 'number_of_articles':article_count.values})



In [10]:

    
# add a column for the percentage of articles that header appears in
fr_article_count_DF['article_percentage'] = (fr_article_count_DF['number_of_articles']/1809018)*100



In [11]:

    
# set pandas options to display 100 rows
# round percentage to 2 decimal places and show top 100 results
pd.options.display.max_rows = 100
fr_article_count_DF.round({'article_percentage': 2}).head(100)









    Out[11]:






  
    
      
      number_of_articles
      section_title
      article_percentage
    
  
  
    
      0
      793236
      Notes et références
      43.85
    
    
      1
      732207
      Liens externes
      40.48
    
    
      2
      427548
      Voir aussi
      23.63
    
    
      3
      336423
      Références
      18.60
    
    
      4
      328723
      Articles connexes
      18.17
    
    
      5
      289374
      Biographie
      16.00
    
    
      6
      231921
      Histoire
      12.82
    
    
      7
      230700
      Bibliographie
      12.75
    
    
      8
      153257
      Lien externe
      8.47
    
    
      9
      119463
      Géographie
      6.60
    
    
      10
      102788
      Notes
      5.68
    
    
      11
      102284
      Annexes
      5.65
    
    
      12
      97813
      Palmarès
      5.41
    
    
      13
      86004
      Description
      4.75
    
    
      14
      85473
      Historique
      4.72
    
    
      15
      81072
      Distribution
      4.48
    
    
      16
      80643
      Démographie
      4.46
    
    
      17
      79613
      Sources
      4.40
    
    
      18
      65770
      Synopsis
      3.64
    
    
      19
      54188
      Fiche technique
      3.00
    
    
      20
      46953
      Carrière
      2.60
    
    
      21
      46324
      Filmographie
      2.56
    
    
      22
      43431
      Article connexe
      2.40
    
    
      23
      40367
      Liens internes
      2.23
    
    
      24
      39969
      Économie
      2.21
    
    
      25
      37215
      Politique et administration
      2.06
    
    
      26
      37065
      Lieux et monuments
      2.05
    
    
      27
      36600
      Publication originale
      2.02
    
    
      28
      35545
      Toponymie
      1.96
    
    
      29
      35222
      Personnalités liées à la commune
      1.95
    
    
      30
      33932
      Source
      1.88
    
    
      31
      31157
      Œuvres
      1.72
    
    
      32
      28891
      Discographie
      1.60
    
    
      33
      25621
      Répartition
      1.42
    
    
      34
      24583
      Administration
      1.36
    
    
      35
      22934
      Communes limitrophes
      1.27
    
    
      36
      22423
      Étymologie
      1.24
    
    
      37
      21455
      Localisation
      1.19
    
    
      38
      21247
      Statistiques
      1.17
    
    
      39
      20097
      Compléments
      1.11
    
    
      40
      20079
      Publications
      1.11
    
    
      41
      20064
      Distinctions
      1.11
    
    
      42
      18609
      Cinéma
      1.03
    
    
      43
      18173
      Récompenses
      1.00
    
    
      44
      17552
      Galerie
      0.97
    
    
      45
      16409
      Événements
      0.91
    
    
      46
      16214
      Télévision
      0.90
    
    
      47
      16135
      Héraldique
      0.89
    
    
      48
      15906
      Résultats
      0.88
    
    
      49
      15189
      Composition
      0.84
    
    
      50
      14568
      Résumé
      0.81
    
    
      51
      14235
      Présentation
      0.79
    
    
      52
      13338
      Caractéristiques
      0.74
    
    
      53
      13194
      Décès
      0.73
    
    
      54
      12824
      Culture locale et patrimoine
      0.71
    
    
      55
      12022
      Liste des titres
      0.66
    
    
      56
      12001
      Liste des maires
      0.66
    
    
      57
      11783
      Classement
      0.65
    
    
      58
      11713
      Source de la traduction
      0.65
    
    
      59
      11528
      Culture
      0.64
    
    
      60
      11462
      Naissances
      0.63
    
    
      61
      10996
      Situation
      0.61
    
    
      62
      10717
      Accueil
      0.59
    
    
      63
      10519
      Architecture
      0.58
    
    
      64
      10282
      Population et société
      0.57
    
    
      65
      10245
      Patronyme
      0.57
    
    
      66
      10223
      Liste des espèces
      0.57
    
    
      67
      10195
      Référence
      0.56
    
    
      68
      9945
      En club
      0.55
    
    
      69
      9925
      Œuvre
      0.55
    
    
      70
      9703
      Parcours
      0.54
    
    
      71
      9648
      Théâtre
      0.53
    
    
      72
      9551
      Production
      0.53
    
    
      73
      9342
      Jumelages
      0.52
    
    
      74
      9270
      Autres
      0.51
    
    
      75
      9242
      Autour du film
      0.51
    
    
      76
      9107
      Contexte
      0.50
    
    
      77
      9104
      Climat
      0.50
    
    
      78
      8892
      Pour approfondir
      0.49
    
    
      79
      8862
      Personnages
      0.49
    
    
      80
      8571
      Personnalités
      0.47
    
    
      81
      8558
      Enseignement
      0.47
    
    
      82
      8481
      Vie privée
      0.47
    
    
      83
      8423
      Hydrographie
      0.47
    
    
      84
      8351
      Albums
      0.46
    
    
      85
      8240
      Références taxinomiques
      0.46
    
    
      86
      8153
      Évolution démographique
      0.45
    
    
      87
      8078
      Titres
      0.45
    
    
      88
      7963
      Population
      0.44
    
    
      89
      7919
      Musique
      0.44
    
    
      90
      7893
      Compétition
      0.44
    
    
      91
      7837
      [[Hameau italien|Hameaux]]
      0.43
    
    
      92
      7520
      Records
      0.42
    
    
      93
      7434
      Organisation
      0.41
    
    
      94
      7316
      Système de jeu
      0.40
    
    
      95
      7278
      Nominations
      0.40
    
    
      96
      7189
      Famille
      0.40
    
    
      97
      7163
      Ouvrages
      0.40
    
    
      98
      7097
      Origine
      0.39
    
    
      99
      7036
      Toponyme
      0.39

	page_id	page_title	heading_level	heading_text
0	412304	Dragon Head (film)	2	Synopsis
1	412304	Dragon Head (film)	2	Fiche technique
2	412304	Dragon Head (film)	2	Distribution
3	412304	Dragon Head (film)	2	Distinctions
4	412304	Dragon Head (film)	2	Voir aussi

	number_of_articles	section_title	article_percentage
0	793236	Notes et références	43.85
1	732207	Liens externes	40.48
2	427548	Voir aussi	23.63
3	336423	Références	18.60
4	328723	Articles connexes	18.17
5	289374	Biographie	16.00
6	231921	Histoire	12.82
7	230700	Bibliographie	12.75
8	153257	Lien externe	8.47
9	119463	Géographie	6.60
10	102788	Notes	5.68
11	102284	Annexes	5.65
12	97813	Palmarès	5.41
13	86004	Description	4.75
14	85473	Historique	4.72
15	81072	Distribution	4.48
16	80643	Démographie	4.46
17	79613	Sources	4.40
18	65770	Synopsis	3.64
19	54188	Fiche technique	3.00
20	46953	Carrière	2.60
21	46324	Filmographie	2.56
22	43431	Article connexe	2.40
23	40367	Liens internes	2.23
24	39969	Économie	2.21
25	37215	Politique et administration	2.06
26	37065	Lieux et monuments	2.05
27	36600	Publication originale	2.02
28	35545	Toponymie	1.96
29	35222	Personnalités liées à la commune	1.95
30	33932	Source	1.88
31	31157	Œuvres	1.72
32	28891	Discographie	1.60
33	25621	Répartition	1.42
34	24583	Administration	1.36
35	22934	Communes limitrophes	1.27
36	22423	Étymologie	1.24
37	21455	Localisation	1.19
38	21247	Statistiques	1.17
39	20097	Compléments	1.11
40	20079	Publications	1.11
41	20064	Distinctions	1.11
42	18609	Cinéma	1.03
43	18173	Récompenses	1.00
44	17552	Galerie	0.97
45	16409	Événements	0.91
46	16214	Télévision	0.90
47	16135	Héraldique	0.89
48	15906	Résultats	0.88
49	15189	Composition	0.84
50	14568	Résumé	0.81
51	14235	Présentation	0.79
52	13338	Caractéristiques	0.74
53	13194	Décès	0.73
54	12824	Culture locale et patrimoine	0.71
55	12022	Liste des titres	0.66
56	12001	Liste des maires	0.66
57	11783	Classement	0.65
58	11713	Source de la traduction	0.65
59	11528	Culture	0.64
60	11462	Naissances	0.63
61	10996	Situation	0.61
62	10717	Accueil	0.59
63	10519	Architecture	0.58
64	10282	Population et société	0.57
65	10245	Patronyme	0.57
66	10223	Liste des espèces	0.57
67	10195	Référence	0.56
68	9945	En club	0.55
69	9925	Œuvre	0.55
70	9703	Parcours	0.54
71	9648	Théâtre	0.53
72	9551	Production	0.53
73	9342	Jumelages	0.52
74	9270	Autres	0.51
75	9242	Autour du film	0.51
76	9107	Contexte	0.50
77	9104	Climat	0.50
78	8892	Pour approfondir	0.49
79	8862	Personnages	0.49
80	8571	Personnalités	0.47
81	8558	Enseignement	0.47
82	8481	Vie privée	0.47
83	8423	Hydrographie	0.47
84	8351	Albums	0.46
85	8240	Références taxinomiques	0.46
86	8153	Évolution démographique	0.45
87	8078	Titres	0.45
88	7963	Population	0.44
89	7919	Musique	0.44
90	7893	Compétition	0.44
91	7837	[[Hameau italien\|Hameaux]]	0.43
92	7520	Records	0.42
93	7434	Organisation	0.41
94	7316	Système de jeu	0.40
95	7278	Nominations	0.40
96	7189	Famille	0.40
97	7163	Ouvrages	0.40
98	7097	Origine	0.39
99	7036	Toponyme	0.39