In [1]:
import pandas as pd
In [2]:
df = pd.read_csv("PARSED_STATES.txt", sep="\t")
df.parsed_state = df.parsed_state.str.upper()
df_states = pd.read_csv("US_STATES.txt", sep="\t")
df_cities = pd.read_json("us.cities.json", orient="records")
df_cities["state"] = df_cities.state.str.upper()
In [3]:
EXTRA_CITY_STATE_MAPPING = pd.read_csv("EXTRA_CITY_STATE_MAPPING.txt", sep="\t",
header=None, index_col=0)
EXTRA_CITY_STATE_MAPPING.head()
Out[3]:
In [4]:
EXTRA_CITY_STATE_MAPPING[1].head().to_dict()
Out[4]:
In [5]:
STATE_NAMES = dict(zip(df_states["State Name"].values.tolist(), df_states["Abbreviation"].values.tolist()))
STATE_CAPITALS = dict(zip(df_states["Capital"].values.tolist(), df_states["Abbreviation"].values.tolist()))
CITY_STATES = dict(zip(df_cities["city"].values.tolist(), df_cities["state"].values.tolist()))
STATE_ABBR = set(df_states["Abbreviation"].values.tolist())
STATE_ABBR.update(["USA"])
CITY_STATES.update(EXTRA_CITY_STATE_MAPPING[1].to_dict())
In [6]:
df.head()
Out[6]:
In [7]:
def get_state_from_text(x):
if x.upper() in STATE_ABBR:
return x.upper()
for k, v in STATE_NAMES.items():
if k.lower() in x.lower():
return v
for k, v in STATE_CAPITALS.items():
if k.lower() in x.lower():
return v
for k, v in CITY_STATES.items():
if k.lower() in x.lower():
return v
for k in x.upper().split():
k = k.replace(".", "")
if k in STATE_ABBR:
return k
return None
def get_state(x):
x = x.replace(".", "").strip()
x_state = x.rsplit(",", 1)
if len(x_state) < 2:
return get_state_from_text(x)
x_state = x_state[-1].strip().upper()
if len(x_state) == 2:
if x_state in STATE_ABBR:
return x_state
return get_state_from_text(x)
In [8]:
df["parse_manual"] = df.location.apply(lambda x: get_state(x))
df.head()
Out[8]:
In [9]:
df[df.parse_manual.isnull()].head(50)
Out[9]:
In [10]:
df[df.parse_manual.isnull()].shape
Out[10]:
In [11]:
df[~df.parse_manual.isnull()].user_counts.sum()
Out[11]:
In [12]:
df[df.parse_manual.isnull()].user_counts.sum()
Out[12]:
In [13]:
get_state_from_text("Upstate NY")
Out[13]:
In [14]:
134341.0/df.user_counts.sum()
Out[14]:
In [15]:
df[(df.parse_manual.isnull()) & (~df.parsed_state.isnull())].head()
Out[15]:
In [16]:
df.ix[(df.parse_manual.isnull()) & (~df.parsed_state.isnull()), "parse_manual"] = df.ix[
(df.parse_manual.isnull()) & (~df.parsed_state.isnull()), "parsed_state"]
df.head()
Out[16]:
In [17]:
print("Proportion of users geolocated: %s" % (df[~df.parse_manual.isnull()].user_counts.sum() * 1./df.user_counts.sum()))
In [18]:
df[(df.parse_manual.isnull()) & (~df.parsed_state.isnull())].head()
Out[18]:
In [19]:
df[~df.parse_manual.isnull()].user_counts.sum()
Out[19]:
In [20]:
df.to_csv("PARSED_STATES.final.txt", sep="\t", index=False, encoding="utf-8")
! head PARSED_STATES.final.txt
In [21]:
df.groupby("parse_manual")["user_counts"].sum().sort_values(ascending=False)
Out[21]:
In [ ]: