Download the file with the full names-by-state data: https://www.ssa.gov/oact/babynames/state/namesbystate.zip
  mkdir data
  wget https://www.ssa.gov/oact/babynames/state/namesbystate.zip
  mkdir SSN_data
  cd SSN_data
  unzip ../namesbystate.zip
  cd ..
  cat SSN_data/*.TXT > data/SSN_names_state.txt
Download the average latitude and longitude for each state from: http://dev.maxmind.com/geoip/legacy/codes/state_latlon/
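Save it alongside the names data. The direct CSV link on that page may differ, so substitute it for the placeholder below; In [93] further down expects the file at data/state_latlon.csv with state, latitude, and longitude columns:
  wget <csv-link-from-that-page> -O data/state_latlon.csv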
In [1]:
    
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn import cross_validation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("ticks")
sns.set_context("poster")
    
In [2]:
    
df = pd.read_csv("data/SSN_names_state.txt", header=None)
    
In [3]:
    
df.head()
    
    Out[3]:
In [4]:
    
df.columns = ["State", "Gender", "Year", "Name", "Count"]
    
In [5]:
    
df.head()
    
    Out[5]:
In [6]:
    
df.shape
    
    Out[6]:
In [7]:
    
df["Gender"].value_counts() # Instances per gender
    
    Out[7]:
In [8]:
    
# Total birth counts per gender per year, then grand totals per gender
df_t = df.pivot_table(values="Count", index="Year", columns="Gender", aggfunc=np.sum)
df_t.sum(axis=0)
    
    Out[8]:
In [9]:
    
df_a = df_t.div(df_t.sum(axis=1), axis=0)*100
plt.plot(df_a.index, df_a["F"], label="Female", marker="o")
plt.plot(df_a.index, df_a["M"], label="Male", marker="s")
plt.xlabel("Year")
plt.ylabel("Percentage of names")
plt.legend(frameon=True, fancybox=True)
    
    Out[9]:
    
In [10]:
    
# Get the top name for each gender in each state:
# sort by Count, keep the first row per (State, Gender), then pivot to a State x Gender table
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot_table(values=["Name", "Count", "Year"], columns="Gender", index="State", aggfunc=lambda x: x.iloc[0])\
.swaplevel(0, 1, axis=1).sort_index(axis=1)
    
    Out[10]:
In [11]:
    
# Get the top name for each gender in each state
# Same as above, with more concise syntax
df.sort(["Count"], ascending=False).groupby(["State", "Gender"]).head(1)\
.pivot(index="State", columns="Gender").swaplevel(0, 1, axis=1).sort_index(axis=1)
    
    Out[11]:
In [12]:
    
# Do the same for the full data:
df_props = df.pivot_table(index=["Name", "State", "Year"], columns="Gender")["Count"].reset_index().fillna(0)
    
In [13]:
    
df_props.head()
    
    Out[13]:
In [14]:
    
df_props.columns
    
    Out[14]:
In [15]:
    
df_props.shape, df.shape
    
    Out[15]:
In [16]:
    
# Get the top 20 names that have occurred in the most distinct years
df.groupby("Name")["Year"].apply(lambda x: len(x.unique())).sort(ascending=False, inplace=False).head(20)
    
    Out[16]:
In [17]:
    
df_props["Total"] = df_props[["F", "M"]].sum(axis=1)
df_props["MaxCount"] = df_props[["F", "M"]].max(axis=1)
df_props.head()
    
    Out[17]:
In [31]:
    
# Write names per gender to file
df_props.to_csv("data/ssn_namesbystate_counts.txt", sep="\t", index=False)
    
In [18]:
    
(df_props[["F", "M"]].div(df_props["Total"], axis=0).max(axis=1) > 0.9).mean()
    
    Out[18]:
In [19]:
    
df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1).head()
    
    Out[19]:
In [20]:
    
df_props["BestGender"] = df_props[["F", "M"]].div(df_props["Total"], axis=0).idxmax(axis=1)
    
In [21]:
    
df_props.head()
    
    Out[21]:
In [22]:
    
df["Name"].unique().shape
    
    Out[22]:
In [23]:
    
df_props[df_props["Total"] > 100].shape
    
    Out[23]:
In [24]:
    
df[(df["Name"] == "Aaban") & (df["State"] == "NY") & (df["Year"] == 2013)]
    
    Out[24]:
In [25]:
    
df_props.describe()
    
    Out[25]:
In [26]:
    
df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].shape
    
    Out[26]:
In [27]:
    
# Keep only names that occur more than 500 times in a given state and year and are 100% a single gender
df_props_filtered = df_props[(df_props["Total"] > 500) & ((df_props["Total"]/df_props["MaxCount"]) == 1)].copy()
df_props_filtered.to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)
    
In [28]:
    
# Create a dataset of names and gender counts, aggregated over states and years
df_names = df_props.groupby("Name")[["F", "M"]].sum().reset_index()
df_names.head()
    
    Out[28]:
In [29]:
    
df_names["Total"] = df_names[["F", "M"]].sum(axis=1)
df_names["MaxCount"] = df_names[["F", "M"]].max(axis=1)
df_names["BestGender"] = df_names[["F", "M"]].div(df_names["Total"], axis=0).idxmax(axis=1)
df_names.head()
    
    Out[29]:
In [93]:
    
# First add start (^) and end ($) markers so n-grams can capture the beginning and end of each name
df_props_filtered["Name_proc"] = df_props_filtered.Name.apply(lambda x: ("^%s$" % x.strip()))
df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_namesbystate_counts.filtered.txt", sep="\t", index=False)
df_names["Name_proc"] = df_names.Name.apply(lambda x: "^%s$" % x.strip())
#df_names.head()
df_names[["Name","Name_proc", "F", "M", "MaxCount","Total","BestGender"]].to_csv("data/ssn_names.txt", sep="\t", index=False)
# Get the latitude and longitude for each state
df_states_latlong = pd.read_csv("data/state_latlon.csv")
df_states_latlong.columns = ["State", "Latitude", "Longitude"]
df_names_latlong = pd.merge(df_props_filtered, df_states_latlong, how="left", on="State")
#pd.merge(df, df_states_latlong, how="left", on="State").to_csv("data/SSN_names_state.latlong.txt", index=False, sep="\t")
    
In [95]:
    
df_props_year = df_props.groupby(["Name", "Year"]).sum().reset_index()
    
In [96]:
    
print df_props_year.shape
df_props_year.head()
    
    
    Out[96]:
In [99]:
    
df_props_year["BestGender"] = df_props_year[["F", "M"]].div(df_props_year["Total"], axis=0).idxmax(axis=1)
df_props_year.head()
    
    Out[99]:
In [104]:
    
gg = df_props_year[df_props_year.Total > 1000].groupby(["Year", "BestGender"])
    
In [109]:
    
#gg.filter(lambda x: x.Total > (np.random.rand(x.Total.shape[0])*x.Total.max())).head()
    
In [34]:
    
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_names.Name_proc.values)
    
    Out[34]:
In [35]:
    
cvec.transform(df_names.Name_proc.values[:10])
    
    Out[35]:
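To see what these features look like, here is a tiny illustrative sketch (toy input, with the min_df cutoff dropped so nothing gets filtered) of the character n-grams the vectorizer produces, including the position-aware ones enabled by the ^ and $ markers; demo is just an illustrative name:

demo = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False)
demo.fit(["^Anna$"])
demo.get_feature_names()  # includes '^A' (name-initial) and 'a$' (name-final) n-grams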
In [36]:
    
model = LogisticRegression(penalty="l1")
np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names.shape[0])
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape
X_train, y_train = cvec.transform(df_names.iloc[train_ids].Name_proc.values), df_names.iloc[train_ids].BestGender
X_test, y_test = cvec.transform(df_names.iloc[test_ids].Name_proc.values), df_names.iloc[test_ids].BestGender
#X_train, X_test, y_train, y_test = cross_validation.train_test_split(cvec.transform(df_names.Name_proc.values),df_names.BestGender, test_size=0.4, random_state=100)
model.fit(X_train, y_train)
    
    Out[36]:
In [37]:
    
X_train.shape, y_train.shape, X_test.shape, y_test.shape
    
    Out[37]:
In [38]:
    
y_pred = model.predict(X_test)
    
In [39]:
    
print classification_report(y_test, y_pred)  # classification_report expects (y_true, y_pred)
    
    
In [42]:
    
pd.concat((df_names.iloc[test_ids][["Name", "BestGender"]], pd.Series(y_pred, name="Predicted", index=test_ids)), axis=1).head()
    
    Out[42]:
In [43]:
    
df_model_coeffs = pd.DataFrame({"Feature": cvec.get_feature_names(), "Coeff": model.coef_[0]})
    
In [44]:
    
df_model_coeffs["Feature_len"] = df_model_coeffs["Feature"].apply(lambda x: len(x))
df_model_coeffs.sort("Coeff", ascending=False).head(20)
    
    Out[44]:
In [45]:
    
df_names[df_names.Name_proc.str.contains("ndel")]
    
    Out[45]:
In [ ]:
    
df_sample = df_props_filtered[["Name", "Year","State","Name_proc", "F", "M", "MaxCount","Total","BestGender"]]
    
In [47]:
    
cvec = CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)
cvec.fit(df_props_filtered.Name_proc)
cvec.transform(df_props_filtered.Name_proc.head()).todense().shape
    
    Out[47]:
In [48]:
    
# Extract a single column from a DataFrame, optionally as a one-column DataFrame
class ColumnFeatures(TransformerMixin):
    def __init__(self, colname, to_df=True):
        print "Initialized extractor for column %s" % colname
        self.colname = colname
        self.to_df = to_df
    def get_feature_names(self):
        return [self.colname]
    def transform(self, X, **transform_params):
        print "Extracting column [%s], to_df = %s" % (self.colname, self.to_df)
        if self.to_df:
            return pd.DataFrame(X[self.colname])
        return X[self.colname]
    def fit(self, X, y=None, **fit_params):
        return self
# Pass-through transformer, handy for inspecting intermediate output shapes
class IdentityTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "X processed by parent. Output shape: %s" % (X.shape, )
        return X
    def fit(self, X, y=None, **fit_params):
        return self
    
# Convert a sparse matrix to a dense one mid-pipeline
class DenseTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        print "New shape: ",  X.todense().shape
        return X.todense()
    def fit(self, X, y=None, **fit_params):
        return self
    
# Extract several columns at once as a DataFrame
class MultiColumnExtractor(TransformerMixin):
    def __init__(self, colnames):
        print "Initialized extractor for column %s" % colnames
        self.colnames = colnames
    def get_feature_names(self):
        return self.colnames
    def transform(self, X, **transform_params):
        print "Extracting columns [%s]" % (self.colnames,)
        return pd.DataFrame(X[self.colnames])
    def fit(self, X, y=None, **fit_params):
        return self
        
pipeline = Pipeline([
        ("features", FeatureUnion([
                    ("year",  Pipeline([
                                ("year_val", ColumnFeatures(colname="Year")),
                                ("year_norm", StandardScaler())
                    ])),
                    ("state", Pipeline([
                               ("state_val", ColumnFeatures(colname="State")),
                               ("state_cat", LabelBinarizer())
                    ])),
                    ("names", Pipeline([
                               ("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                               ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
                    ])),
                    
                ]))
    ])
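As a side note, here is a minimal self-contained sketch (toy data, reusing the ColumnFeatures transformer defined above) of how FeatureUnion stacks the outputs of its sub-pipelines side by side into one feature matrix:

toy = pd.DataFrame({"Year": [1990, 2000, 2010],
                    "Name_proc": ["^Ann$", "^Bob$", "^Eve$"]})
toy_union = FeatureUnion([
    ("year", Pipeline([("year_val", ColumnFeatures(colname="Year")),
                       ("year_norm", StandardScaler())])),
    ("names", Pipeline([("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                        ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 2), lowercase=False))]))
])
toy_union.fit_transform(toy).shape  # (3, 1 + number of bigrams): one scaled Year column, then the n-gram block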
    
    
In [49]:
    
pipeline.fit_transform(df_props_filtered).shape
    
    
    
    Out[49]:
In [50]:
    
df_props_filtered.values[:10]
    
    Out[50]:
In [51]:
    
model = LogisticRegression(penalty="l1")
np.random.seed(1337)
shuffle_ids = np.random.permutation(df_props_filtered.shape[0])  # permute rows of df_props_filtered, not df_names
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape
X_train, y_train = pipeline.transform(df_props_filtered.iloc[train_ids]), df_props_filtered.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_props_filtered.iloc[test_ids]), df_props_filtered.iloc[test_ids].BestGender
    
    
In [52]:
    
model.fit(X_train, y_train)
    
    Out[52]:
In [53]:
    
y_pred = model.predict(X_test)
print classification_report(y_test, y_pred)
    
    
In [54]:
    
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": ["Year"]\
                                + f.transformer_list[1][1].named_steps["state_cat"].classes_.tolist()\
                                + f.transformer_list[2][1].named_steps["name_ngram"].get_feature_names(),\
                                "Coeff": model.coef_[0]})
    
In [55]:
    
df_model_coeffs.sort("Coeff", ascending=True).head(20)
    
    Out[55]:
In [61]:
    
pipeline = Pipeline([
        ("features", FeatureUnion([
                    ("year_latlong",  Pipeline([
                                ("year_val", MultiColumnExtractor(colnames=["Year", "Latitude", "Longitude"])),
                                ("year_norm", StandardScaler())
                    ])),
                    ("names", Pipeline([
                               ("name_val", ColumnFeatures(colname="Name_proc", to_df=False)),
                               ("name_ngram", CountVectorizer(analyzer="char", ngram_range=(2, 4), binary=True, lowercase=False, min_df=10)),
                    ])),
                    
                ]))
    ])
    
    
In [62]:
    
model = LogisticRegression(penalty="l1")
np.random.seed(1337)
shuffle_ids = np.random.permutation(df_names_latlong.shape[0])  # permute rows of df_names_latlong, not df_names
split_point = int(len(shuffle_ids)*0.6)
train_ids, test_ids = shuffle_ids[:split_point], shuffle_ids[split_point:]
train_ids.shape, test_ids.shape
X_train, y_train = pipeline.fit_transform(df_names_latlong.iloc[train_ids]), df_names_latlong.iloc[train_ids].BestGender
X_test, y_test = pipeline.transform(df_names_latlong.iloc[test_ids]), df_names_latlong.iloc[test_ids].BestGender
    
    
In [63]:
    
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print classification_report(y_test, y_pred)
    
    
In [64]:
    
df_props.head()
    
    Out[64]:
In [65]:
    
f = pipeline.named_steps["features"]
df_model_coeffs = pd.DataFrame({"Feature": f.transformer_list[0][1].named_steps["year_val"].get_feature_names()\
                                + f.transformer_list[1][1].named_steps["name_ngram"].get_feature_names(),\
                                "Coeff": model.coef_[0]})
    
In [66]:
    
f = pipeline.named_steps["features"]
    
In [67]:
    
f.transformer_list[0][1].named_steps["year_val"].get_feature_names()
    
    Out[67]:
In [68]:
    
df_model_coeffs.head()
    
    Out[68]:
In [69]:
    
df_t = df_props.groupby("Name")[["F", "M"]].max()
df_t.head()
    
    Out[69]:
In [70]:
    
df_t["Prop"] = np.abs(df_t["F"] - df_t["M"]) / (df_t["F"] + df_t["M"])
df_t[(df_t["Prop"] < 0.1) & ((df_t["F"] + df_t["M"]) > 100)].sort("Prop")
    
    Out[70]:
In [71]:
    
# Majority gender per (Name, Year), aggregated over states
df_t = df_props.groupby(["Name", "Year"])[["F", "M"]].sum().reset_index()
df_t["BestGender"] = df_t[["F", "M"]].idxmax(axis=1)
    
In [72]:
    
df_t.head()
    
    Out[72]:
In [80]:
    
clust_model = AgglomerativeClustering(n_clusters=10)
    
In [81]:
    
cids = clust_model.fit_predict(cvec.transform(df_names.Name_proc.head(100)).todense())
    
In [82]:
    
cids
    
    Out[82]:
In [90]:
    
pd.concat((df_names.Name.head(100), pd.Series(cids, name="ClusterIDs", index=df_names.Name_proc.head(100).index)), axis=1).head()
    
    Out[90]:
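A quick way (just a sketch) to inspect the clustering is to list the 100 names grouped by their cluster id:

pd.Series(df_names.Name.head(100).values, index=cids).groupby(level=0).apply(list)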