In [29]:
import pandas as pd
import matplotlib.pylab as plt
import io
import requests
import seaborn as sns
import pandas_profiling
import missingno as msno
%matplotlib inline

In [16]:
PROCESSED_DATA_URL = "https://raw.githubusercontent.com/BuzzFeedNews/H-2-certification-data/master/data/processed/H-2-certification-decisions.csv"

In [19]:
s=requests.get(PROCESSED_DATA_URL).content
immigration_df=pd.read_csv(io.StringIO(s.decode('utf-8')))


/home/yassine/Envs/data-science/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (0,8,9,14,15,16,17,18,20,21) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

A general exploration of the immigration data


In [21]:
immigration_df.head()


Out[21]:
case_no visa_type fy last_event_date case_status n_requested n_certified is_certified certification_begin_date certification_end_date ... employer_state employer_city employer_address_1 employer_address_2 employer_postal_code worksite_state worksite_city agent_name organization_flag is_duplicate
0 9455254 H-2B 2000 1999-10-01 DENIED, TEMPORARY 1 0 False NaN NaN ... CA LOS ANGELES NaN NaN NaN NaN NaN NaN NaN NaN
1 8222219 H-2B 2000 1999-10-04 CERTIFIED, TEMPORARY 2 2 True NaN NaN ... CO BEAVER CREEK NaN NaN NaN NaN NaN NaN NaN NaN
2 8222168 H-2B 2000 1999-10-04 CERTIFIED, TEMPORARY 2 2 True NaN NaN ... CO MT. CRESTED BUTTE NaN NaN NaN NaN NaN NaN NaN NaN
3 8222169 H-2B 2000 1999-10-04 CERTIFIED, TEMPORARY 5 5 True NaN NaN ... CO MT. CRESTED BUTTE NaN NaN NaN NaN NaN NaN NaN NaN
4 8222170 H-2B 2000 1999-10-04 CERTIFIED, TEMPORARY 40 40 True NaN NaN ... CO MT. CRESTED BUTTE NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 22 columns


In [31]:
immigration_report = pandas_profiling.ProfileReport(immigration_df)

In [32]:
immigration_report.to_file('immigration_data_exploration_report.html')

In [30]:
msno.matrix(immigration_df)



In [ ]: