This notebook was put together by [Roman Prokofyev](http://prokofyev.ch)@[eXascale Infolab](http://exascale.info/). Source and license info is on [GitHub](https://github.com/dragoon/kilogram/).

Prerequisites

Pandas: pip install pandas
Matplotlib and mpltools (used for the ggplot plot style): pip install matplotlib mpltools



In [1]:

    
import matplotlib.pyplot as plt
from mpltools import style
import numpy as np
style.use('ggplot')
%matplotlib inline
import pandas as pd
import shelve
from collections import defaultdict

Construct the label counts table from the predicted and organic count files



In [2]:

    
# Build one record per (uri, label) pair, then turn the records into a DataFrame.
#
# Both input files are read as tab-separated lines of the form
#   <uri>\t<label>\t<normal_count>,<lower_count>
# Files are opened in `with` blocks so the handles are closed as soon as they
# have been consumed (the original left both files open).
count_dict = {}

# Inferred ("predicted") counts define the set of (uri, label) pairs we keep.
# Organic counts start at 0 and are filled in from the second file below.
with open('../mapreduce/predicted_label_counts.txt') as predicted_file:
    for line in predicted_file:
        uri, label, values = line.split('\t')
        # int() tolerates the trailing newline left on lower_count.
        upper_count, lower_count = values.split(',')
        # NOTE(review): 'len' splits the label on '_', but the labels shown in the
        # output below are space-separated (every 'len' is 1) - confirm the
        # intended separator.
        count_dict[(uri, label)] = {'infer_normal': int(upper_count), 'infer_lower': int(lower_count), 'len': len(label.split('_')),
                           'label': label, 'organ_normal': 0, 'organ_lower': 0, 'uri': uri}

# Organic counts only update pairs that already exist; organic-only pairs are ignored.
with open('../mapreduce/organic_label_counts.txt') as organic_file:
    for line in organic_file:
        uri, label, values = line.split('\t')
        if (uri, label) in count_dict:
            upper_count, lower_count = values.split(',')
            count_dict[(uri, label)].update({'organ_normal': int(upper_count), 'organ_lower': int(lower_count)})

# list(...) keeps the constructor input a plain list of dicts on Python 3 as well,
# where dict.values() returns a view rather than a list.
counts_df = pd.DataFrame(list(count_dict.values()))
# Free the intermediate dict; the DataFrame now holds all the data.
del count_dict
counts_df.head()









    Out[2]:






  
    
      
      infer_lower
      infer_normal
      label
      len
      organ_lower
      organ_normal
      uri
    
  
  
    
      0
       0
       12
               Toni Negri
       1
       0
       13
                 Antonio_Negri
    
    
      1
       0
       38
               Mike Groff
       1
       0
       40
                    Mike_Groff
    
    
      2
       0
       19
             Zigzag River
       1
       0
        7
                  Zigzag_River
    
    
      3
       0
        1
       St Francis Rangers
       1
       0
        1
       St_Francis_Rangers_F.C.
    
    
      4
       0
        6
             Semiha Yankı
       1
       0
        5
                  Semiha_Yankı

Generate the list of unambiguous labels (ambiguous lowercase forms are excluded)



In [22]:

    
from __future__ import division
"""
We never exclude uppercase labels since we don't match at the beginning of a sentence
"""
includes = open('../mapreduce/unambiguous_labels.txt', 'w')
for row in counts_df.iterrows():
    row = row[1]
    exclude = False
    label = row['label']
    uri = row['uri']
    
    # skip uppercase
    if label.isupper():
        includes.write(label+'\t'+uri+'\n')
        continue
    # if label appears only in lowercase - add to lower includes
    if row['organ_normal'] == 0:  # means label is lowercase
        if row['organ_lower'] > 1:
            includes.write(label+'\t'+uri+'\n')
        continue
    else:
        infer_ratio = row['infer_normal']/(row['infer_lower'] or 1)
        orig_ratio = row['organ_normal']/(row['organ_lower'] or 1)
        if infer_ratio == 0:
            # weird label, p. ex. 中华人民共和国
            continue
        # always write a normal-case label
        includes.write(label+'\t'+uri+'\n')
        if orig_ratio/infer_ratio < 2 and row['infer_lower'] > 0:
            includes.write(label.lower()+'\t'+uri+'\n')
includes.close()

Generate typed n-grams

hdfs dfs -cat /user/roman/wikipedia_ngrams/* | python spark_typed_ngrams_from_plain.py > typed_ngrams.txt
hdfs dfs -put typed_ngrams.txt /user/roman/wikipedia_typed_ngrams/

Convert to an HBase-suitable format:

./run_job.py -m ./type_prediction/mapper.py -r ./type_prediction/reducer.py "/user/roman/wikipedia_typed_ngrams" /user/roman/hbase_wikipedia_typed_ngrams

Upload into HBase:

pig -p table=typogram -p path=/user/roman/hbase_wikipedia_typed_ngrams ../extra/hbase_upload_array.pig



In [3]:

    
# Show every label recorded for the 'Cicada' URI.
counts_df[counts_df['uri'] == 'Cicada']









    Out[3]:






  
    
      
      infer_lower
      infer_normal
      label
      len
      organ_lower
      organ_normal
      uri
    
  
  
    
      253275 
         1
        0
       chicharras
       1
         1
        0
       Cicada
    
    
      506026 
         0
       24
        Cicadidae
       1
         1
       19
       Cicada
    
    
      972905 
       325
        0
           cicada
       1
       118
        0
       Cicada
    
    
      1190857
       401
       28
          Cicadas
       1
       163
        6
       Cicada
    
    
      2219444
         0
        2
       Cicadoidea
       1
         0
        2
       Cicada



In [19]:

    
# Rows with a non-zero organic normal-case count whose inferred counts are
# lowercase only (infer_lower > 0, infer_normal == 0).
counts_df[
    (counts_df['organ_normal'] > 0)
    & (counts_df['infer_lower'] > 0)
    & (counts_df['infer_normal'] == 0)
]









    Out[19]:






  
    
      
      infer_lower
      infer_normal
      label
      len
      organ_lower
      organ_normal
      uri
    
  
  
    
      2333   
         24
       0
            12/50
       1
       0
        1
                                     Alvis_12/50
    
    
      3758   
        278
       0
        1977-1988
       1
       0
        1
                             Operation_Fair_Play
    
    
      8528   
          5
       0
            54.40
       1
       0
        1
                                           54-40
    
    
      9245   
          4
       0
          中华人民共和国
       1
       0
        1
                                           China
    
    
      9371   
          5
       0
             £194
       1
       0
        1
                                  Pound_sterling
    
    
      10012  
         69
       0
        1741-1743
       1
       0
        1
                     Russo-Swedish_War_(1741–43)
    
    
      12374  
         53
       0
             $139
       1
       0
        1
                                 Canadian_dollar
    
    
      13204  
         35
       0
             -900
       1
       0
        2
                                 Airbus_A350_XWB
    
    
      14697  
          9
       0
             £827
       1
       0
        1
                                  Pound_sterling
    
    
      16496  
          4
       0
             #520
       1
       0
        1
                                         PRR_520
    
    
      18147  
          1
       0
         亜麻色の髪の乙女
       1
       0
        1
                         Amairo_no_Kami_no_Otome
    
    
      19004  
         11
       0
             €199
       1
       0
        1
                                            Euro
    
    
      19124  
          6
       0
              ₤15
       1
       0
        3
                                  Pound_sterling
    
    
      24809  
          2
       0
              梅兰芳
       1
       0
        1
                                     Mei_Lanfang
    
    
      28067  
          2
       0
              ₩28
       1
       0
        1
                                South_Korean_won
    
    
      28358  
        172
       0
        1787-1792
       1
       0
        2
                     Russo-Turkish_War_(1787–92)
    
    
      29531  
         23
       0
             £128
       1
       0
        1
                                  Pound_sterling
    
    
      30234  
          2
       0
            €5.00
       1
       0
        1
                                            Euro
    
    
      30670  
          2
       0
              /27
       1
       0
        1
                                     Nieuport_27
    
    
      31738  
         12
       0
             16/9
       1
       0
        6
                                    16/9_(album)
    
    
      33552  
         82
       0
        1971-1976
       1
       0
        1
                           Leo_Kottke:_1971–1976
    
    
      34078  
          8
       0
           4×4=12
       1
       0
        6
                                          4×4=12
    
    
      37000  
          1
       0
          آش رشته
       1
       0
        1
                                     Ash_reshteh
    
    
      39911  
          9
       0
         11001001
       1
       0
        7
                                        11001001
    
    
      41628  
        753
       0
         £250,000
       1
       0
        3
                                  Pound_sterling
    
    
      42278  
        123
       0
              $79
       1
       0
        3
                                 Canadian_dollar
    
    
      43409  
         35
       0
        1594-1603
       1
       0
        1
                       Nine_Years'_War_(Ireland)
    
    
      43926  
          1
       0
        1577-1582
       1
       0
        1
            Livonian_campaign_of_Stephen_Báthory
    
    
      46156  
         26
       0
             3/14
       1
       0
        1
                                          Pi_Day
    
    
      47326  
          5
       0
             €245
       1
       0
        1
                                            Euro
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      2209632
        666
       0
          £15,000
       1
       0
        2
                                  Pound_sterling
    
    
      2211533
       4239
       0
              2.6
       1
       0
        1
                                    Linux_kernel
    
    
      2216687
         54
       0
              '32
       1
       0
        1
       United_States_presidential_election,_1932
    
    
      2217112
         11
       0
        1667-1668
       1
       0
        1
                               War_of_Devolution
    
    
      2217481
          6
       0
                毛
       1
       0
        1
                                   Mao_(surname)
    
    
      2217606
          1
       0
             €985
       1
       0
        1
                                            Euro
    
    
      2217965
        455
       0
            50:50
       1
       0
        1
                               50–50_(game_show)
    
    
      2220972
        180
       0
          1987-89
       1
       0
        2
                        1987–89_JVP_Insurrection
    
    
      2221919
         17
       0
             £620
       1
       0
        2
                                  Pound_sterling
    
    
      2222612
        104
       0
          1988-90
       1
       0
        1
                        1987–89_JVP_Insurrection
    
    
      2223286
         12
       0
           48-215
       1
       0
        7
                                   Holden_48-215
    
    
      2225050
         64
       0
            4.2.2
       1
       0
        1
                              Android_Jelly_Bean
    
    
      2225834
          4
       0
          4 3 2 1
       1
       0
        1
                          4,_3,_2,_1_(k-os_song)
    
    
      2228050
         14
       0
             .07%
       1
       0
       12
                                            .07%
    
    
      2229407
         51
       0
            5"/38
       1
       0
        2
                               5"/38_caliber_gun
    
    
      2233715
          3
       0
             ¥130
       1
       0
        2
                                    Japanese_yen
    
    
      2236966
          6
       0
            35007
       1
       0
        3
                                           35007
    
    
      2239875
        813
       0
             93.3
       1
       0
        1
                                            KIOA
    
    
      2240488
         39
       0
              £58
       1
       0
        3
                                  Pound_sterling
    
    
      2241074
          7
       0
          860-880
       1
       0
        1
             860–880_Lake_Shore_Drive_Apartments
    
    
      2247979
          1
       0
               ₹1
       1
       0
        1
                                    Indian_rupee
    
    
      2248444
          1
       0
            21799
       1
       0
        1
                                 21799_Ciociaria
    
    
      2248695
        256
       0
       $3,000,000
       1
       0
        2
            Confederate_States_of_America_dollar
    
    
      2250649
         67
       0
              '00
       1
       0
        1
                                     '00_(album)
    
    
      2251127
        796
       0
             95.5
       1
       0
        1
                                            WBOP
    
    
      2253383
          1
       0
             £163
       1
       0
        1
                                  Pound_sterling
    
    
      2254253
         14
       0
             €510
       1
       0
        1
                                            Euro
    
    
      2256352
          3
       0
             ¥160
       1
       0
        1
                                    Japanese_yen
    
    
      2259940
          3
       0
            6.0.7
       1
       0
        2
                                        System_6
    
    
      2262078
          1
       0
             ₩420
       1
       0
        3
                                South_Korean_won
    
  

1272 rows × 7 columns



In [ ]:

	infer_normal	label	len	organ_normal	uri
0	12	Toni Negri	1	13	Antonio_Negri
1	38	Mike Groff	1	40	Mike_Groff
2	19	Zigzag River	1	7	Zigzag_River
3	1	St Francis Rangers	1	1	St_Francis_Rangers_F.C.
4	6	Semiha Yankı	1	5	Semiha_Yankı

	infer_lower	infer_normal	label	len	organ_lower	organ_normal	uri
253275	1	0	chicharras	1	1	0	Cicada
506026	0	24	Cicadidae	1	1	19	Cicada
972905	325	0	cicada	1	118	0	Cicada
1190857	401	28	Cicadas	1	163	6	Cicada
2219444	0	2	Cicadoidea	1	0	2	Cicada

	infer_lower	infer_normal	label	len	organ_lower	organ_normal	uri
2333	24	0	12/50	1	0	1	Alvis_12/50
3758	278	0	1977-1988	1	0	1	Operation_Fair_Play
8528	5	0	54.40	1	0	1	54-40
9245	4	0	中华人民共和国	1	0	1	China
9371	5	0	£194	1	0	1	Pound_sterling
10012	69	0	1741-1743	1	0	1	Russo-Swedish_War_(1741–43)
12374	53	0	$139	1	0	1	Canadian_dollar
13204	35	0	-900	1	0	2	Airbus_A350_XWB
14697	9	0	£827	1	0	1	Pound_sterling
16496	4	0	#520	1	0	1	PRR_520
18147	1	0	亜麻色の髪の乙女	1	0	1	Amairo_no_Kami_no_Otome
19004	11	0	€199	1	0	1	Euro
19124	6	0	₤15	1	0	3	Pound_sterling
24809	2	0	梅兰芳	1	0	1	Mei_Lanfang
28067	2	0	₩28	1	0	1	South_Korean_won
28358	172	0	1787-1792	1	0	2	Russo-Turkish_War_(1787–92)
29531	23	0	£128	1	0	1	Pound_sterling
30234	2	0	€5.00	1	0	1	Euro
30670	2	0	/27	1	0	1	Nieuport_27
31738	12	0	16/9	1	0	6	16/9_(album)
33552	82	0	1971-1976	1	0	1	Leo_Kottke:_1971–1976
34078	8	0	4×4=12	1	0	6	4×4=12
37000	1	0	آش رشته	1	0	1	Ash_reshteh
39911	9	0	11001001	1	0	7	11001001
41628	753	0	£250,000	1	0	3	Pound_sterling
42278	123	0	$79	1	0	3	Canadian_dollar
43409	35	0	1594-1603	1	0	1	Nine_Years'_War_(Ireland)
43926	1	0	1577-1582	1	0	1	Livonian_campaign_of_Stephen_Báthory
46156	26	0	3/14	1	0	1	Pi_Day
47326	5	0	€245	1	0	1	Euro
...	...	...	...	...	...	...	...
2209632	666	0	£15,000	1	0	2	Pound_sterling
2211533	4239	0	2.6	1	0	1	Linux_kernel
2216687	54	0	'32	1	0	1	United_States_presidential_election,_1932
2217112	11	0	1667-1668	1	0	1	War_of_Devolution
2217481	6	0	毛	1	0	1	Mao_(surname)
2217606	1	0	€985	1	0	1	Euro
2217965	455	0	50:50	1	0	1	50–50_(game_show)
2220972	180	0	1987-89	1	0	2	1987–89_JVP_Insurrection
2221919	17	0	£620	1	0	2	Pound_sterling
2222612	104	0	1988-90	1	0	1	1987–89_JVP_Insurrection
2223286	12	0	48-215	1	0	7	Holden_48-215
2225050	64	0	4.2.2	1	0	1	Android_Jelly_Bean
2225834	4	0	4 3 2 1	1	0	1	4,_3,_2,_1_(k-os_song)
2228050	14	0	.07%	1	0	12	.07%
2229407	51	0	5"/38	1	0	2	5"/38_caliber_gun
2233715	3	0	¥130	1	0	2	Japanese_yen
2236966	6	0	35007	1	0	3	35007
2239875	813	0	93.3	1	0	1	KIOA
2240488	39	0	£58	1	0	3	Pound_sterling
2241074	7	0	860-880	1	0	1	860–880_Lake_Shore_Drive_Apartments
2247979	1	0	₹1	1	0	1	Indian_rupee
2248444	1	0	21799	1	0	1	21799_Ciociaria
2248695	256	0	$3,000,000	1	0	2	Confederate_States_of_America_dollar
2250649	67	0	'00	1	0	1	'00_(album)
2251127	796	0	95.5	1	0	1	WBOP
2253383	1	0	£163	1	0	1	Pound_sterling
2254253	14	0	€510	1	0	1	Euro
2256352	3	0	¥160	1	0	1	Japanese_yen
2259940	3	0	6.0.7	1	0	2	System_6
2262078	1	0	₩420	1	0	3	South_Korean_won