In [1]:
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from pprint import pprint
import seaborn as sns
import pandas as pd
import numpy as np
import langdetect
import requests
import chardet
% matplotlib inline
In [2]:
# Load the Zika diagnosis label dataset (export dated 2016-12-07).
df = pd.read_csv('data/161207_ZikaLabels.csv')
# Preview the first rows to confirm the columns parsed as expected.
df.head()
Out[2]:
In [3]:
df[df.zika==True].head(100)
Out[3]:
In [4]:
df.shape
Out[4]:
In [5]:
df.isnull().sum()
Out[5]:
In [6]:
# Category frequencies for locale and the coded diagnosis, separated visually.
# Parenthesized print works under both Python 2 and Python 3; the original
# print-statement syntax is a SyntaxError on Python 3.
print(df.locale.value_counts())
print('#'*35)
print(df.diagnosisCODED.value_counts())
In [7]:
# Bar chart of the zika label balance, using the explicit figure/axes API.
fig, ax = plt.subplots(figsize=(5, 5))
df.zika.value_counts().plot(kind='bar', title='Zika Distribution', ax=ax)
plt.show()
In [8]:
# Bar chart of coded-diagnosis frequencies. Sized and titled for consistency
# with the zika distribution plot above (the original had neither).
plt.figure(figsize=(5,5))
df.diagnosisCODED.value_counts().plot(kind='bar', title='Diagnosis (coded) Distribution')
plt.show()
In [9]:
# Tally the character encoding chardet infers for each raw diagnosis string.
encoding_count = {}
for text in df.diagnosisRAW:
    try:
        encoding = chardet.detect(text)['encoding']
    except TypeError:
        # Non-string entries (e.g. NaN for missing diagnoses) cannot be
        # sniffed; skip them. The original bare `except: pass` also swallowed
        # KeyboardInterrupt/SystemExit and hid real bugs.
        continue
    # dict.get with a default replaces the manual membership check + branch.
    encoding_count[encoding] = encoding_count.get(encoding, 0) + 1
In [10]:
chardet.detect('é')
Out[10]:
In [11]:
pprint(encoding_count)
In [12]:
# Web-scrape language abbreviations from lingoes.net.
URL = 'http://www.lingoes.net/en/translator/langcode.htm'
response = requests.get(URL)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
html_tag = 'td'  # tags include, but are not limited to ['table','tbody','td','tr']
html_text = soup.findAll(html_tag)
lang_table = [website_info.text.encode('utf-8') for website_info in html_text]
# Pair the flat <td> list into two-column rows. `//` (floor division) keeps the
# row count an int on both Python 2 and 3 — `/` is a TypeError for reshape
# under Python 3's true division.
lang_abbrevs = np.array(lang_table).reshape(len(lang_table) // 2, 2)  # view data
# Build the lookup table, skipping the header row at index 0.
# NOTE(review): assumes column 0 holds the code used as a key later
# (lang_dict[lang]) and column 1 the language name — verify against the
# fetched table.
lang_dict = {}
for row in range(1, lang_abbrevs.shape[0]):
    key = lang_abbrevs[row, 0]
    val = lang_abbrevs[row, 1]
    lang_dict[key] = val
In [13]:
# frequency of languages utilized
lang_count = {}
for text in df.diagnosisRAW:
try:
lang = langdetect.detect(text.decode('ISO-8859-2').encode('ASCII','ignore'))
lang = lang.encode('ASCII')
key = lang_dict[lang]
if key in lang_count.keys():
lang_count[key]+=1
else:
lang_count[key]=1
except:
pass
In [14]:
pprint(lang_count)