Download the file containing all the per-state names data: https://www.ssa.gov/oact/babynames/state/namesbystate.zip
mkdir data
wget https://www.ssa.gov/oact/babynames/state/namesbystate.zip
mkdir SSN_data
cd SSN_data
unzip ../namesbystate.zip
cd ..
cat SSN_data/*.TXT > data/SSN_names_state.txt
Download the average latitude and longitude for each state from: http://dev.maxmind.com/geoip/legacy/codes/state_latlon/
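Save the CSV as data/state_latlon.csv, which is the path the notebook reads below; assuming wget can fetch the CSV link from that page directly, something like:
wget -O data/state_latlon.csv <csv-url-from-the-page-above>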
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")
In [2]:
df = pd.read_csv("data/SSN_names_state.txt", header=None)
In [3]:
df.head()
Out[3]:
In [4]:
df.columns = ["State", "Gender", "Year", "Name", "Count"]
In [5]:
df.head()
Out[5]:
In [6]:
df.shape
Out[6]:
In [7]:
df["Gender"].value_counts() # Instances per gender
Out[7]:
In [8]:
# Total name count per gender per year, then the overall total per gender
df_t = df.pivot_table(values="Count", index="Year", columns="Gender", aggfunc=np.sum)
df_t.sum(axis=0)
Out[8]:
In [9]:
df_a = df_t.div(df_t.sum(axis=1), axis=0)*100
plt.plot(df_a.index, df_a["F"], label="Female", marker="o")
plt.plot(df_a.index, df_a["M"], label="Male", marker="s")
plt.xlabel("Year")
plt.ylabel("Percentage of names")
plt.legend(frameon=True, fancybox=True)
Out[9]:
In [10]:
# Get the top name per gender in each state
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot_table(values=["Name", "Count", "Year"], columns="Gender", index="State", aggfunc=lambda x: x.iloc[0])\
.swaplevel(0, 1, axis=1).sort_index(axis=1)
Out[10]:
In [11]:
# Get the top name per gender in each state
# Same as above, with more concise syntax
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot(index="State", columns="Gender").swaplevel(0, 1, axis=1).sort_index(axis=1)
Out[11]:
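The sort-then-head(1) idiom above keeps only the highest-Count row within each (State, Gender) group. A toy illustration with made-up numbers (hypothetical data, not from the SSA file):
toy = pd.DataFrame({"State": ["NY", "NY", "NY"], "Gender": ["F", "F", "M"],
                    "Name": ["Emma", "Olivia", "Liam"], "Count": [50, 40, 60]})
# Highest count per (State, Gender): Emma for (NY, F), Liam for (NY, M)
toy.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)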
In [12]:
# Do the same reshaping for the full data: one row per (Name, State, Year), with separate F and M count columns
df_props = df.pivot_table(index=["Name", "State", "Year"], columns="Gender")["Count"].reset_index().fillna(0)
In [13]:
df_props.head()
Out[13]:
In [14]:
df_props.columns
Out[14]:
In [15]:
df_props.shape, df.shape
Out[15]:
In [16]:
# Get the top 20 names that occur in the largest number of distinct years
df.groupby("Name")["Year"].apply(lambda x: len(x.unique())).sort(ascending=False, inplace=False).head(20)
Out[16]:
In [17]:
df_props["Total"] = df_props[["F", "M"]].sum(axis=1)
df_props["MaxCount"] = df_props[["F", "M"]].max(axis=1)
df_props.head()
Out[17]:
In [31]:
# Write the per-(Name, State, Year) gender counts to file
df_props.to_csv("data/ssn_namesbystate_counts.txt", sep="\t", index=False)
In [18]:
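# Fraction of rows where one gender accounts for more than 90% of the total count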
(df_props[["F", "M"]].div(df_props["Total"], axis=0).max(axis=1) > 0.9).mean()
Out[18]:
In [19]:
df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1).head()
Out[19]:
In [20]:
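# BestGender: the gender holding the larger share of the count in each row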
df_props["BestGender"] = df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1)
In [21]:
df_props.head()
Out[21]:
In [22]:
df["Name"].unique().shape
Out[22]:
In [23]:
df_props[df_props["Total"] > 100].shape
Out[23]:
In [24]:
df[(df["Name"] == "Aaban") & (df["State"] == "NY") & (df["Year"] == 2013)]
Out[24]:
In [25]:
df_props.describe()
Out[25]:
In [26]:
df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].shape
Out[26]:
In [27]:
# Keep only names that occur more than 500 times in a given state and year and are 100% a single gender
df_props_filtered = df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].copy()
df_props_filtered.to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)
In [28]:
# Create one dataset aggregated over states and years, with just names and gender counts
df_names = df_props.groupby("Name")[["F", "M"]].sum().reset_index()
df_names.head()
Out[28]:
In [29]:
df_names["Total"] = df_names[["F", "M"]].sum(axis=1)
df_names["MaxCount"] = df_names[["F", "M"]].max(axis=1)
df_names["BestGender"] = df_names[["F", "M"]].div(df_names["Total"], axis=0).idxmax(axis=1)
df_names.head()
Out[29]:
In [93]:
# First add start ("^") and end ("$") characters to mark the beginning and end of each name
df_props_filtered["Name_proc"] = df_props_filtered.Name.apply(lambda x: ("^%s$" % x.strip()))
df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)
df_names["Name_proc"] = df_names.Name.apply(lambda x: "^%s$" % x.strip())
#df_names.head()
df_names[["Name","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_names.txt", sep="\t", index=False)
# Get the latitude and longitude for each state
df_states_latlong = pd.read_csv("data/state_latlon.csv")
df_states_latlong.columns = ["State", "Latitude", "Longitude"]
df_names_latlong = pd.merge(df_props_filtered, df_states_latlong, how="left", on="State")
#pd.merge(df, df_states_latlong, how="left", on="State").to_csv("data/SSN_names_state.latlong.txt", index=False, sep="\t")
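The "^" and "$" markers let the character n-grams used later encode prefixes and suffixes explicitly. A quick illustration (the vectorizer below is a throwaway, fit on a single made-up name):
toy_vec = CountVectorizer(analyzer="char", ngram_range=(2, 4), lowercase=False)
toy_vec.fit(["^Anna$"])
print toy_vec.get_feature_names()  # includes "^A" (starts with A) and "a$" (ends in a)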
In [95]:
df_props_year = df_props.groupby(["Name", "Year"]).sum().reset_index()
In [96]:
print df_props_year.shape
df_props_year.head()
Out[96]:
In [99]:
df_props_year["BestGender"] = df_props_year[["F", "M"]].div(df_props_year["Total"], axis=0).idxmax(axis=1)
df_props_year.head()
Out[99]:
In [104]:
gg = df_props_year[df_props_year.Total > 1000].groupby(["Year", "BestGender"])
In [109]:
#gg.filter(lambda x: x.Total > (np.random.rand(x.Total.shape[0])*x.Total.max())).head()
In [34]:
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_names.Name_proc.values)
Out[34]:
In [35]:
cvec.transform(df_names.Name_proc.values[:10])
Out[35]:
In [36]:
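# L1 penalty drives most n-gram coefficients to exactly zero, which keeps the coefficient inspection below readable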
model = LogisticRegression(penalty="l1")
np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape
X_train, y_train = cvec.transform(df_names.iloc[train_ids].Name_proc.values), df_names.iloc[train_ids].BestGender
X_test, y_test = cvec.transform(df_names.iloc[test_ids].Name_proc.values), df_names.iloc[test_ids].BestGender
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(cvec.transform(df_names.Name_proc.values),df_names.BestGender, test_size=0.4, random_state=100)
model.fit(X_train, y_train)
Out[36]:
In [37]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Out[37]:
In [38]:
y_pred = model.predict(X_test)
In [39]:
print classification_report(y_test, y_pred)  # signature is (y_true, y_pred)
In [42]:
pd.concat((df_names.iloc[test_ids][["Name", "BestGender"]], pd.Series(y_pred, name="Predicted", index=test_ids)), axis=1).head()
Out[42]:
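To score an unseen name, wrap it in the same boundary markers and run it through the fitted vectorizer and model ("Kendall" is just an example input):
print model.predict(cvec.transform(["^Kendall$"]))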
In [43]:
df_model_coeffs = pd.DataFrame({"Feature": cvec.get_feature_names(), "Coeff": model.coef_[0]})
In [44]:
df_model_coeffs["Feature_len"] = df_model_coeffs["Feature"].apply(lambda x: len(x))
df_model_coeffs.sort("Coeff", ascending=False).head(20)
Out[44]:
In [45]:
df_names[df_names.Name_proc.str.contains("ndel")]
Out[45]:
In [ ]:
df_sample = df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]]
In [47]:
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_props_filtered.Name_proc)
cvec.transform(df_props_filtered.Name_proc.head()).todense().shape
Out[47]:
In [48]:
class ColumnFeatures(TransformerMixin):
def __init__(self, colname, to_df=True):
print "Initialized extractor for column %s" % colname
self.colname = colname
self.to_df = to_df
def get_feature_names(self):
return [self.colname]
def transform(self, X, **transform_params):
print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
if self.to_df:
return pd.DataFrame(X[self.colname])
return X[self.colname]
def fit(self, X, y=None, **fit_params):
return self
class IdentityTransformer(TransformerMixin):
def transform(self, X, **transform_params):
print "X processed by parent. Output shape: %s" % (X.shape, )
return X
def fit(self, X, y=None, **fit_params):
return self
class DenseTransformer(TransformerMixin):
def transform(self, X, **transform_params):
print "New shape: ", X.todense().shape
return X.todense()
def fit(self, X, y=None, **fit_params):
return self
class MultiColumnExtractor(TransformerMixin):
def __init__(self, colnames):
print "Initialized extractor for column %s" % colnames
self.colnames = colnames
def get_feature_names(self):
return self.colnames
def transform(self, X, **transform_params):
print "Extracting columns [%s]" % (self.colnames,)
return pd.DataFrame(X[self.colnames])
def fit(self, X, y=None, **fit_params):
return self
pipeline = Pipeline([
("features", FeatureUnion([
("year", Pipeline([
("year_val", ColumnFeatures(colname="Year")),
("year_norm", StandardScaler())
])),
("state", Pipeline([
("state_val", ColumnFeatures(colname="State")),
("state_cat", LabelBinarizer())
])),
("names", Pipeline([
("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
])),
]))
])
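Note: newer scikit-learn versions reject LabelBinarizer inside a Pipeline because its fit does not accept a y argument. If you hit that TypeError, a thin wrapper like the sketch below (a workaround for newer versions, not part of the original run) can be swapped in for the "state_cat" step:
class PipelineLabelBinarizer(LabelBinarizer):
    # Accept (and ignore) y so that Pipeline/FeatureUnion can call fit(X, y)
    def fit(self, X, y=None):
        return super(PipelineLabelBinarizer, self).fit(X)
    def fit_transform(self, X, y=None):
        return super(PipelineLabelBinarizer, self).fit(X).transform(X)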
In [49]:
pipeline.fit_transform(df_props_filtered).shape
Out[49]:
In [50]:
df_props_filtered.values[:10]
Out[50]:
In [51]:
model = LogisticRegression(penalty="l1")
np.random.seed(1337)
shuffle_ids = np.random.permutation(df_props_filtered.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape
X_train, y_train = pipeline.transform(df_props_filtered.iloc[train_ids]), df_props_filtered.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_props_filtered.iloc[test_ids]), df_props_filtered.iloc[test_ids].BestGender
In [52]:
model.fit(X_train, y_train)
Out[52]:
In [53]:
y_pred = model.predict(X_test)
print classification_report(y_test, y_pred)
In [54]:
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": ["Year"] + f.transformer_list[1][1].named_steps["state_cat"].classes_.tolist() + f.transformer_list[2][1].named_steps["name_ngram"].get_feature_names(), "Coeff": model.coef_[0]})
In [55]:
df_model_coeffs.sort("Coeff", ascending=True).head(20)
Out[55]:
In [61]:
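# Same pipeline as before, but with the one-hot state encoding swapped for numeric latitude/longitude features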
pipeline = Pipeline([
("features", FeatureUnion([
("year_latlong", Pipeline([
("year_val", MultiColumnExtractor(colnames=["Year", "Latitude", "Longitude"])),
("year_norm", StandardScaler())
])),
("names", Pipeline([
("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
])),
]))
])
In [62]:
model = LogisticRegression(penalty="l1")
np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names_latlong.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape
X_train, y_train = pipeline.fit_transform(df_names_latlong.iloc[train_ids]), df_names_latlong.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_names_latlong.iloc[test_ids]), df_names_latlong.iloc[test_ids].BestGender
In [63]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print classification_report(y_test, y_pred)
In [64]:
df_props.head()
Out[64]:
In [65]:
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": f.transformer_list[0][1].named_steps["year_val"].get_feature_names()\
+ f.transformer_list[1][1].named_steps["name_ngram"].get_feature_names(),\
"Coeff": model.coef_[0]})
In [66]:
f = pipeline.named_steps["features"]
In [67]:
f.transformer_list[0][1].named_steps["year_val"].get_feature_names()
Out[67]:
In [68]:
df_model_coeffs.head()
Out[68]:
In [69]:
df_t = df_props.groupby("Name")[["F", "M"]].max()
df_t.head()
Out[69]:
In [70]:
df_t["Prop"] = np.abs(df_t["F"] - df_t["M"]) / (df_t["F"] + df_t["M"])
df_t[(df_t["Prop"] < 0.1) & ((df_t["F"] + df_t["M"]) > 100)].sort("Prop")
Out[70]:
In [71]:
df_t = df_props.groupby(["Name", "Year"])[["F", "M"]].sum().reset_index()
df_t["BestGender"] = df_t[["F", "M"]].idxmax(axis=1)
In [72]:
df_t.head()
Out[72]:
In [80]:
clust_model = AgglomerativeClustering(n_clusters=10)
In [81]:
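# AgglomerativeClustering needs a dense array, hence todense(); cluster only the first 100 names to keep it cheap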
cids = clust_model.fit_predict(cvec.transform(df_names.Name_proc.head(100)).todense())
In [82]:
cids
Out[82]:
In [90]:
pd.concat((df_names.Name.head(100), pd.Series(cids, name="ClusterIDs", index=df_names.Name_proc.head(100).index)), axis=1).head()
Out[90]:
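To eyeball what landed in each cluster, group the names by cluster id (a quick sketch reusing the frame above):
df_clusters = pd.concat((df_names.Name.head(100), pd.Series(cids, name="ClusterIDs", index=df_names.Name_proc.head(100).index)), axis=1)
print df_clusters.groupby("ClusterIDs")["Name"].apply(list)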