In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
import string
import unidecode
# Worked example of the normalisation applied to free-text fields.
s = "Historia de la Inquisición' "
# A Python 2 str literal is a byte string and must be decoded; a Python 3 str
# is already text and has no .decode(), so decode only when we hold bytes.
if isinstance(s, bytes):
    s = s.decode('utf-8')
# Transliterate to ASCII, trim surrounding whitespace, trim surrounding
# punctuation, then lowercase -- in that order, so the trailing "' " is
# removed in two passes.
s = unidecode.unidecode(s).strip().strip(string.punctuation).lower()
s
Out[2]:
In [3]:
# Three spellings of the same place that differ only in the delimiter used
# between the city and the country.
f1, f2, f3 = "madrid, espana", "madrid [espana", "madrid (espana"
# Split each sample on its own delimiter; the cell displays a tuple of lists.
tuple(sample.split(delim) for sample, delim in ((f1, ","), (f2, "["), (f3, "(")))
# ... followed by a few more string operations, e.g. strip for whitespace.
Out[3]:
In [5]:
# Read the pre-split location table, then keep only the two columns of
# interest and discard rows where both of them are missing.
locs_raw = pd.read_csv("../data/locs/just_split_locs.csv")
locs = locs_raw[["Country", "City"]].dropna(how="all")
In [5]:
# Map every distinct (City, Country) pair to the index labels of its rows.
# NOTE(review): groupby appears to skip rows with a missing City or Country by
# default, so pairs with only one field filled would not be counted -- confirm.
loc_groups = locs.groupby(["City", "Country"]).groups
In [6]:
# One entry in loc_groups per distinct (City, Country) pair.
n_loc_combos = len(loc_groups)
"Num. of unique country/city combinations: {0}".format(n_loc_combos)
Out[6]:
In [7]:
import operator
# Row count for each (City, Country) pair ...
d = dict((combo, len(rows)) for combo, rows in loc_groups.items())
# ... ranked from most to least frequent as a list of (pair, count) tuples.
s = list(d.items())
s.sort(key=operator.itemgetter(1), reverse=True)
In [8]:
# Rows accounted for by the 5000 most frequent (City, Country) pairs, shown
# as a fraction of all rows in locs.
rows_in_top_5000 = sum(count for _combo, count in s[:5000])
"Percent of data set covered by 5000 most common combos: {0}".format(
    rows_in_top_5000 / float(len(locs)))
Out[8]:
In [9]:
# Load the tab-separated table of text records used by every cell below.
text_df = pd.read_csv("../data/geo_cleaned_texts.tsv", sep="\t")
In [11]:
# Group the records by their "slug" column. Note that clean_locs is a GroupBy
# object, not a DataFrame.
# NOTE(review): "slug" is presumably the normalised location string -- confirm.
clean_locs = text_df.groupby("slug")
In [14]:
# Per-slug number of records with a non-null control_number. Selecting the
# column before counting avoids counting every other column as well.
counted = clean_locs["control_number"].count()
In [20]:
# Bar chart of the ten slugs with the most records.
# Series.sort(inplace=False, ...) and .ix were removed from pandas;
# sort_values() returns the sorted copy and .iloc takes the first ten rows by
# position. NOTE(review): assumes the slug index is non-integer, in which case
# the old .ix[:10] was positional too -- confirm.
counted.sort_values(ascending=False).iloc[:10].plot(kind="bar")
Out[20]:
In [21]:
from fuzzywuzzy import fuzz
def _is_cervantes(row):
    """Return True when the row's "author" fuzzy-matches "Miguel Cervantes".

    A row matches if the token-set ratio is exactly 100 or, failing that, the
    token-sort ratio is above 90.
    """
    target = "Miguel Cervantes"
    author = row["author"]
    if fuzz.token_set_ratio(target, author) == 100:
        return True
    return fuzz.token_sort_ratio(target, author) > 90


# Row-wise boolean mask over text_df, used to keep only the matching records.
cerv_df = text_df[text_df.apply(_is_cervantes, axis=1)]
In [23]:
# Preview the first ten rows that passed the fuzzy author filter.
cerv_df.head(10)
Out[23]:
In [24]:
# Number of records that passed the fuzzy author filter.
len(cerv_df)
Out[24]:
In [34]:
# Group the Cervantes records by slug and count the non-null control numbers
# in each group.
cerv_locs = cerv_df.groupby("slug")
counted_cerv_locs = cerv_locs["control_number"].count()
# Show the single most frequent slug. Series.sort(inplace=False, ...) and .ix
# were removed from pandas, so use sort_values() and positional .iloc.
# NOTE(review): assumes the slug index is non-integer, in which case the old
# .ix[:1] was positional too -- confirm.
counted_cerv_locs.sort_values(ascending=False).iloc[:1]
Out[34]:
In [48]:
# Map every distinct (author, title) pair to the index labels of its rows.
# NOTE(review): groupby appears to skip rows with a missing author or title by
# default -- confirm.
text_groups = text_df.groupby(["author", "title"]).groups
In [49]:
# One entry in text_groups per distinct (author, title) pair.
n_text_combos = len(text_groups)
"Num. of unique title/author combinations: {0}".format(n_text_combos)
Out[49]:
In [50]:
# Row count for each (author, title) pair. This rebinds d and s, which held
# the location counts earlier in the notebook.
d = dict((combo, len(rows)) for combo, rows in text_groups.items())
# Rank from most to least frequent as a list of (pair, count) tuples.
s = list(d.items())
s.sort(key=operator.itemgetter(1), reverse=True)
In [51]:
# Records whose (author, title) pair occurs more than once, as a fraction of
# all records. Filtering on count > 1 selects exactly the non-singleton pairs;
# the previous hard-coded s[:2000000] cutoff only approximated that boundary.
non_singleton_rows = sum(count for _combo, count in s if count > 1)
"Percent of data set covered by non-singleton combos: {0}".format(
    non_singleton_rows / float(len(text_df)))
Out[51]:
In [52]:
def prob_dist(d):
    """Count how many keys of ``d`` share each value.

    Despite the name, the result holds raw frequencies, not normalised
    probabilities.

    Parameters
    ----------
    d : dict
        Mapping whose values are hashable.

    Returns
    -------
    dict
        ``{value: number of keys in d that carry that value}``; empty when
        ``d`` is empty.
    """
    # Function-scope import so this cell does not depend on an import made
    # in some other cell.
    from collections import Counter

    # Only the values matter; the keys are never inspected.
    return dict(Counter(d.values()))
# Frequency of each group size: how many keys of d share each count.
probs = prob_dist(d)
In [56]:
# Peek at one (group size, frequency) entry. On Python 3 dict.items() is a
# view and cannot be indexed, so materialise it as a list first.
list(probs.items())[5]
Out[56]:
In [55]:
# Log-log scatter of group size against how many groups have that size.
plt.xscale("log")
plt.yscale("log")
plt.xlabel("Records per group")
plt.ylabel("Number of groups")
# Pass concrete lists: on Python 3 keys()/values() are views, which
# matplotlib cannot reliably convert to arrays.
plt.scatter(list(probs.keys()), list(probs.values()))
Out[55]:
Problematic!
Don Quixote de la Mancha, Cervantes, Miguel de
Rimas, Lope de Vega
String comparisons? Comparing every pair of records would take n(n-1)/2 comparisons:
In [58]:
# Number of records to compare.
# NOTE(review): presumably the size of the full corpus -- confirm the source.
num_texts = 8682520
# Unordered pairs: n * (n - 1) / 2. Floor division keeps the result an exact
# int on Python 3 as well; the product of consecutive integers is always even,
# so nothing is lost.
num_pairs = (num_texts * (num_texts - 1)) // 2
num_pairs
Out[58]:
How many duplicates are there...really?
In [ ]: