In [1]:
import pandas as pd

In [2]:
# Load the parsed-location table plus the state / city reference data.
df = pd.read_csv("PARSED_STATES.txt", sep="\t")
df["parsed_state"] = df["parsed_state"].str.upper()
df_states = pd.read_csv("US_STATES.txt", sep="\t")
# City reference list; normalize state codes to upper case to match df.
df_cities = pd.read_json("us.cities.json", orient="records")
df_cities["state"] = df_cities["state"].str.upper()

In [3]:
# Manual city/region -> state overrides: column 0 is the key (index),
# column 1 holds the two-letter state code.
EXTRA_CITY_STATE_MAPPING = pd.read_csv(
    "EXTRA_CITY_STATE_MAPPING.txt", sep="\t", header=None, index_col=0
)
EXTRA_CITY_STATE_MAPPING.head()


Out[3]:
1
0
NYC NY
Midwest IL
Brooklyn NY
Manhattan NY
SF Bay Area CA

In [4]:
# First five override entries as a plain dict (key -> state code).
EXTRA_CITY_STATE_MAPPING[1].head().to_dict()


Out[4]:
{'Brooklyn': 'NY',
 'Manhattan': 'NY',
 'Midwest': 'IL',
 'NYC': 'NY',
 'SF Bay Area': 'CA'}

In [5]:
# Lookup tables mapping full state name / capital city / known city
# to the two-letter state abbreviation.  (zip iterates Series values
# directly, so the .values.tolist() round-trip is unnecessary.)
STATE_NAMES = dict(zip(df_states["State Name"], df_states["Abbreviation"]))
STATE_CAPITALS = dict(zip(df_states["Capital"], df_states["Abbreviation"]))
CITY_STATES = dict(zip(df_cities["city"], df_cities["state"]))

# Valid codes, plus "USA" as a country-level catch-all.
STATE_ABBR = set(df_states["Abbreviation"])
STATE_ABBR.add("USA")

# Manual overrides take precedence over the generic city list.
CITY_STATES.update(EXTRA_CITY_STATE_MAPPING[1].to_dict())

In [6]:
# Preview the input: location strings, user counts, and the upstream
# parser's output (parsed_state).
df.head()


Out[6]:
location user_counts parsed_state
0 United States 6906 NaN
1 USA 5284 NaN
2 Washington, DC 3431 DC
3 New York, NY 2225 NaN
4 Los Angeles, CA 1979 CA

In [7]:
def get_state_from_text(x):
    """Best-effort extraction of a US state code from a free-text location.

    Lookup order: exact abbreviation match on the whole string, then state-name
    substring, capital-city substring, known-city substring, and finally any
    whitespace-separated token (periods stripped) that is an abbreviation.

    Returns the two-letter code (or "USA") as a string, or None if no rule
    matches.
    """
    # Hoist the case conversions: the original recomputed x.lower() on every
    # iteration of every loop below.
    x_upper = x.upper()
    x_lower = x.lower()
    if x_upper in STATE_ABBR:
        return x_upper
    # NOTE(review): these are plain substring checks, so a shorter name can
    # match inside a longer one (e.g. "kansas" in "arkansas"); which entry
    # wins depends on dict insertion order — TODO confirm acceptable.
    for name, abbr in STATE_NAMES.items():
        if name.lower() in x_lower:
            return abbr
    for capital, abbr in STATE_CAPITALS.items():
        if capital.lower() in x_lower:
            return abbr
    for city, abbr in CITY_STATES.items():
        if city.lower() in x_lower:
            return abbr
    # Token fallback handles dotted forms such as "N.Y." -> "NY".
    for token in x_upper.split():
        token = token.replace(".", "")
        if token in STATE_ABBR:
            return token
    return None

def get_state(x):
    """Resolve a location string to a state code.

    Strips periods and whitespace; if the text ends in ", XX" where XX is a
    known two-letter abbreviation, that wins. Otherwise the whole string is
    handed to get_state_from_text.
    """
    cleaned = x.replace(".", "").strip()
    parts = cleaned.rsplit(",", 1)
    if len(parts) == 2:
        candidate = parts[-1].strip().upper()
        if len(candidate) == 2 and candidate in STATE_ABBR:
            return candidate
    return get_state_from_text(cleaned)

In [8]:
# Parse every location string.  The original wrapped get_state in a
# redundant lambda; .apply accepts the function directly.
df["parse_manual"] = df.location.apply(get_state)
df.head()


Out[8]:
location user_counts parsed_state parse_manual
0 United States 6906 NaN USA
1 USA 5284 NaN USA
2 Washington, DC 3431 DC DC
3 New York, NY 2225 NaN NY
4 Los Angeles, CA 1979 CA CA

In [9]:
# Inspect locations the manual parser could not resolve.
df[df.parse_manual.isnull()].head(50)


Out[9]:
location user_counts parsed_state parse_manual
67 Earth 327 NaN None
86 Worldwide 241 NaN None
96 Down the Rabbit Hole 225 NaN None
105 Global 208 NaN None
129 Everywhere 170 NaN None
143 Planet Earth 155 NaN None
160 139 NaN None
194 Secret 109 NaN None
214 Jupiter 98 NaN None
268 Power State 75 NaN None
292 Hearts & Minds 68 NaN None
305 Penna. 63 NaN None
314 The World 62 NaN None
323 Mars 59 NaN None
325 DMV 58 NaN None
327 everywhere 58 NaN None
341 Flyover, State 56 NaN None
349 Pacific Northwest 55 NaN None
357 Virtual 53 NaN None
359 Snapchat: hdavritt3 53 NaN None
409 Parts Unknown 45 NaN None
427 the internets 43 NaN None
445 Indy, Baby 41 NaN None
467 Internet 39 NaN None
490 PNW 37 NaN None
498 worldwide 36 NaN None
506 Every Hour 35 NaN None
527 \n 33 NaN None
555 Nationwide 31 NaN None
565 Estados Unidos 31 NaN None
583 London, England 30 NaN None
585 757 30 NaN None
588 The Internet 30 NaN None
594 Antarctica 29 NaN None
600 here 29 NaN None
611 Earth 28 NaN None
613 Here 28 NaN None
624 The Mid-West 27 NaN None
629 Everywhere! 27 NaN None
637 Classified 26 NaN None
648 Planet Claire 26 NaN None
650 World Wide 26 NaN None
653 Matter Doesn't Matter 26 NaN None
677 Fire Roger Goodell 25 NaN None
684 Hell 25 NaN None
689 MOTHER EARTH (PROTECT HER) 25 NaN None
698 Everywhere money@ 24 NaN None
699 Pacific North West 24 NaN None
703 hell 24 NaN None
707 SF 24 NaN None

In [10]:
# (rows, cols) of the unresolved subset.
df[df.parse_manual.isnull()].shape


Out[10]:
(14920, 4)

In [11]:
# Total users whose location WAS resolved.
df[~df.parse_manual.isnull()].user_counts.sum()


Out[11]:
169472

In [12]:
# Total users whose location was NOT resolved.
df[df.parse_manual.isnull()].user_counts.sum()


Out[12]:
26664

In [13]:
# Spot check: the token fallback finds "NY" inside "Upstate NY".
get_state_from_text("Upstate NY")


Out[13]:
'NY'

In [14]:
# NOTE(review): 134341.0 appears hard-coded from an earlier run — it matches
# neither sum computed above (169472 resolved / 26664 unresolved).
# TODO: recompute from df instead of using a literal.
134341.0/df.user_counts.sum()


Out[14]:
0.6849380022025533

In [15]:
# Rows where the upstream parser succeeded but ours failed — backfill candidates.
df[(df.parse_manual.isnull()) & (~df.parsed_state.isnull())].head()


Out[15]:
location user_counts parsed_state parse_manual
903 Ann Arbor 18 MI None
994 Twin Cities 17 CA None
1207 O-H-I-O 14 OH None
1263 NorthWoods 14 FL None
1399 ÜT: 39.770411,-104.959902 12 CO None

In [16]:
# Backfill parse_manual from parsed_state where our parser failed but the
# upstream parser succeeded.
# Fix: .ix was deprecated in pandas 0.20 and removed in 1.0 — use .loc with
# an explicit boolean mask.
fill_mask = df.parse_manual.isnull() & ~df.parsed_state.isnull()
df.loc[fill_mask, "parse_manual"] = df.loc[fill_mask, "parsed_state"]
df.head()


Out[16]:
location user_counts parsed_state parse_manual
0 United States 6906 NaN USA
1 USA 5284 NaN USA
2 Washington, DC 3431 DC DC
3 New York, NY 2225 NaN NY
4 Los Angeles, CA 1979 CA CA

In [17]:
# Coverage after the backfill: share of all users with a resolved state.
geolocated_users = df[~df.parse_manual.isnull()].user_counts.sum()
total_users = df.user_counts.sum()
print("Proportion of users geolocated: %s" % (geolocated_users * 1./total_users))


Proportion of users geolocated: 0.8719969816861769

In [18]:
# Sanity check: after the backfill this should be empty.
df[(df.parse_manual.isnull()) & (~df.parsed_state.isnull())].head()


Out[18]:
location user_counts parsed_state parse_manual

In [19]:
# Resolved user total after the backfill (was 169472 before).
df[~df.parse_manual.isnull()].user_counts.sum()


Out[19]:
171030

In [20]:
# Persist the final table and eyeball the first few lines.
df.to_csv("PARSED_STATES.final.txt", sep="\t", index=False, encoding="utf-8")
! head PARSED_STATES.final.txt


location	user_counts	parsed_state	parse_manual
United States	6906		USA
USA	5284		USA
Washington, DC	3431	DC	DC
New York, NY	2225		NY
Los Angeles, CA	1979	CA	CA
California, USA	1944	CA	CA
New York	1476	NY	NY
Chicago, IL	1322	IL	IL
Los Angeles	1235		CA

In [21]:
# Users per resolved state, largest first.
df.groupby("parse_manual")["user_counts"].sum().sort_values(ascending=False)


Out[21]:
parse_manual
CA     22601
USA    19821
NY     15064
TX     12258
FL      9156
IL      5593
WA      5254
DC      5184
PA      4749
OH      4630
GA      4451
MA      3992
VA      3774
NC      3641
NJ      3417
AZ      3287
IN      3276
MI      3175
CO      2691
TN      2663
MD      2474
OR      2229
MO      2033
LA      2008
KS      1931
WI      1833
NV      1790
MN      1703
SC      1558
AL      1473
OK      1457
KY      1397
CT      1290
UT      1062
IA       959
MS       791
NE       703
ME       697
NH       592
HI       510
RI       461
AR       456
NM       439
ID       402
WV       332
AK       298
MT       297
VT       281
DE       277
SD       237
WY       176
ND       161
PR        24
AS        10
GU         5
VI         5
MP         2
Name: user_counts, dtype: int64

In [ ]: