In [1]:
import pandas as pd

In [2]:
# Load the parsed-location table plus the state / city reference data.
df = pd.read_csv("PARSED_STATES.txt", sep="\t")
df["parsed_state"] = df["parsed_state"].str.upper()
df_states = pd.read_csv("US_STATES.txt", sep="\t")
# City reference list; normalize state codes to upper case to match df.
df_cities = pd.read_json("us.cities.json", orient="records")
df_cities["state"] = df_cities["state"].str.upper()

In [3]:
# Manual city/region -> state overrides: column 0 is the key (index),
# column 1 holds the two-letter state code.
EXTRA_CITY_STATE_MAPPING = pd.read_csv(
    "EXTRA_CITY_STATE_MAPPING.txt", sep="\t", header=None, index_col=0
)
EXTRA_CITY_STATE_MAPPING.head()


Out[3]:
1
0
NYC NY
Midwest IL
Brooklyn NY
Manhattan NY
SF Bay Area CA

In [4]:
# First five override entries as a plain dict (key -> state code).
EXTRA_CITY_STATE_MAPPING[1].head().to_dict()


Out[4]:
{'Brooklyn': 'NY',
 'Manhattan': 'NY',
 'Midwest': 'IL',
 'NYC': 'NY',
 'SF Bay Area': 'CA'}

In [5]:
# Lookup tables mapping full state name / capital city / known city
# to the two-letter state abbreviation.  (zip iterates Series values
# directly, so the .values.tolist() round-trip is unnecessary.)
STATE_NAMES = dict(zip(df_states["State Name"], df_states["Abbreviation"]))
STATE_CAPITALS = dict(zip(df_states["Capital"], df_states["Abbreviation"]))
CITY_STATES = dict(zip(df_cities["city"], df_cities["state"]))

# Valid codes, plus "USA" as a country-level catch-all.
STATE_ABBR = set(df_states["Abbreviation"])
STATE_ABBR.add("USA")

# Manual overrides take precedence over the generic city list.
CITY_STATES.update(EXTRA_CITY_STATE_MAPPING[1].to_dict())

In [6]:
# Preview the input: location strings, user counts, and the upstream
# parser's output (parsed_state).
df.head()


Out[6]:
location user_counts parsed_state
0 United States 6906 NaN
1 USA 5284 NaN
2 Washington, DC 3431 DC
3 New York, NY 2225 NaN
4 Los Angeles, CA 1979 CA

In [7]:
def get_state_from_text(x):
    """Best-effort extraction of a US state code from a free-text location.

    Lookup order: exact abbreviation match on the whole string, then state-name
    substring, capital-city substring, known-city substring, and finally any
    whitespace-separated token (periods stripped) that is an abbreviation.

    Returns the two-letter code (or "USA") as a string, or None if no rule
    matches.
    """
    # Hoist the case conversions: the original recomputed x.lower() on every
    # iteration of every loop below.
    x_upper = x.upper()
    x_lower = x.lower()
    if x_upper in STATE_ABBR:
        return x_upper
    # NOTE(review): these are plain substring checks, so a shorter name can
    # match inside a longer one (e.g. "kansas" in "arkansas"); which entry
    # wins depends on dict insertion order — TODO confirm acceptable.
    for name, abbr in STATE_NAMES.items():
        if name.lower() in x_lower:
            return abbr
    for capital, abbr in STATE_CAPITALS.items():
        if capital.lower() in x_lower:
            return abbr
    for city, abbr in CITY_STATES.items():
        if city.lower() in x_lower:
            return abbr
    # Token fallback handles dotted forms such as "N.Y." -> "NY".
    for token in x_upper.split():
        token = token.replace(".", "")
        if token in STATE_ABBR:
            return token
    return None

def get_state(x):
    """Resolve a location string to a state code.

    Strips periods and whitespace; if the text ends in ", XX" where XX is a
    known two-letter abbreviation, that wins. Otherwise the whole string is
    handed to get_state_from_text.
    """
    cleaned = x.replace(".", "").strip()
    parts = cleaned.rsplit(",", 1)
    if len(parts) == 2:
        candidate = parts[-1].strip().upper()
        if len(candidate) == 2 and candidate in STATE_ABBR:
            return candidate
    return get_state_from_text(cleaned)

In [8]:
# Parse every location string.  The original wrapped get_state in a
# redundant lambda; .apply accepts the function directly.
df["parse_manual"] = df.location.apply(get_state)
df.head()


Out[8]:
location user_counts parsed_state parse_manual
0 United States 6906 NaN USA
1 USA 5284 NaN USA
2 Washington, DC 3431 DC DC
3 New York, NY 2225 NaN NY
4 Los Angeles, CA 1979 CA CA

In [9]:
# Inspect locations the manual parser could not resolve.
df[df.parse_manual.isnull()].head(50)


Out[9]:
location user_counts parsed_state parse_manual
67 Earth 327 NaN None
86 Worldwide 241 NaN None
96 Down the Rabbit Hole 225 NaN None
105 Global 208 NaN None
129 Everywhere 170 NaN None
143 Planet Earth 155 NaN None
160 139 NaN None
194 Secret 109 NaN None
214 Jupiter 98 NaN None
268 Power State 75 NaN None
292 Hearts & Minds 68 NaN None
305 Penna. 63 NaN None
314 The World 62 NaN None
323 Mars 59 NaN None
325 DMV 58 NaN None
327 everywhere 58 NaN None
341 Flyover, State 56 NaN None
349 Pacific Northwest 55 NaN None
357 Virtual 53 NaN None
359 Snapchat: hdavritt3 53 NaN None
409 Parts Unknown 45 NaN None
427 the internets 43 NaN None
445 Indy, Baby 41 NaN None
467 Internet 39 NaN None
490 PNW 37 NaN None
498 worldwide 36 NaN None
506 Every Hour 35 NaN None
527 \n 33 NaN None
555 Nationwide 31 NaN None
565 Estados Unidos 31 NaN None
583 London, England 30 NaN None
585 757 30 NaN None
588 The Internet 30 NaN None
594 Antarctica 29 NaN None
600 here 29 NaN None
611 Earth 28 NaN None
613 Here 28 NaN None
624 The Mid-West 27 NaN None
629 Everywhere! 27 NaN None
637 Classified 26 NaN None
648 Planet Claire 26 NaN None
650 World Wide 26 NaN None
653 Matter Doesn't Matter 26 NaN None
677 Fire Roger Goodell 25 NaN None
684 Hell 25 NaN None
689 MOTHER EARTH (PROTECT HER) 25 NaN None
698 Everywhere money@ 24 NaN None
699 Pacific North West 24 NaN None
703 hell 24 NaN None
707 SF 24 NaN None

In [10]:
# (rows, cols) of the unresolved subset.
df[df.parse_manual.isnull()].shape


Out[10]:
(14920, 4)

In [11]:
# Total users whose location WAS resolved.
df[~df.parse_manual.isnull()].user_counts.sum()


Out[11]:
169472

In [12]:
# Total users whose location was NOT resolved.
df[df.parse_manual.isnull()].user_counts.sum()


Out[12]:
26664

In [13]:
# Spot check: the token fallback finds "NY" inside "Upstate NY".
get_state_from_text("Upstate NY")


Out[13]:
'NY'

In [14]:
# NOTE(review): 134341.0 appears hard-coded from an earlier run — it matches
# neither sum computed above (169472 resolved / 26664 unresolved).
# TODO: recompute from df instead of using a literal.
134341.0/df.user_counts.sum()


Out[14]:
0.6849380022025533

In [15]:
# Rows where the upstream parser succeeded but ours failed — backfill candidates.
df[(df.parse_manual.isnull()) & (~df.parsed_state.isnull())].head()


Out[15]:
location user_counts parsed_state parse_manual
903 Ann Arbor 18 MI None
994 Twin Cities 17 CA None
1207 O-H-I-O 14 OH None
1263 NorthWoods 14 FL None
1399 ÜT: 39.770411,-104.959902 12 CO None

In [16]:
# Backfill parse_manual from parsed_state where our parser failed but the
# upstream parser succeeded.
# Fix: .ix was deprecated in pandas 0.20 and removed in 1.0 — use .loc with
# an explicit boolean mask.
fill_mask = df.parse_manual.isnull() & ~df.parsed_state.isnull()
df.loc[fill_mask, "parse_manual"] = df.loc[fill_mask, "parsed_state"]
df.head()


Out[16]:
location user_counts parsed_state parse_manual
0 United States 6906 NaN USA
1 USA 5284 NaN USA
2 Washington, DC 3431 DC DC
3 New York, NY 2225 NaN NY
4 Los Angeles, CA 1979 CA CA

In [17]:
# Coverage after the backfill: share of all users with a resolved state.
geolocated_users = df[~df.parse_manual.isnull()].user_counts.sum()
total_users = df.user_counts.sum()
print("Proportion of users geolocated: %s" % (geolocated_users * 1./total_users))


Proportion of users geolocated: 0.8719969816861769

In [18]:
# Sanity check: after the backfill this should be empty.
df[(df.parse_manual.isnull()) & (~df.parsed_state.isnull())].head()


Out[18]:
location user_counts parsed_state parse_manual

In [19]:
# Resolved user total after the backfill (was 169472 before).
df[~df.parse_manual.isnull()].user_counts.sum()


Out[19]:
171030

In [20]:
# Persist the final table and eyeball the first few lines.
df.to_csv("PARSED_STATES.final.txt", sep="\t", index=False, encoding="utf-8")
! head PARSED_STATES.final.txt


location	user_counts	parsed_state	parse_manual
United States	6906		USA
USA	5284		USA
Washington, DC	3431	DC	DC
New York, NY	2225		NY
Los Angeles, CA	1979	CA	CA
California, USA	1944	CA	CA
New York	1476	NY	NY
Chicago, IL	1322	IL	IL
Los Angeles	1235		CA

In [21]:
# Users per resolved state, largest first.
df.groupby("parse_manual")["user_counts"].sum().sort_values(ascending=False)


Out[21]:
parse_manual
CA     22601
USA    19821
NY     15064
TX     12258
FL      9156
IL      5593
WA      5254
DC      5184
PA      4749
OH      4630
GA      4451
MA      3992
VA      3774
NC      3641
NJ      3417
AZ      3287
IN      3276
MI      3175
CO      2691
TN      2663
MD      2474
OR      2229
MO      2033
LA      2008
KS      1931
WI      1833
NV      1790
MN      1703
SC      1558
AL      1473
OK      1457
KY      1397
CT      1290
UT      1062
IA       959
MS       791
NE       703
ME       697
NH       592
HI       510
RI       461
AR       456
NM       439
ID       402
WV       332
AK       298
MT       297
VT       281
DE       277
SD       237
WY       176
ND       161
PR        24
AS        10
GU         5
VI         5
MP         2
Name: user_counts, dtype: int64

In [ ]: