In [1]:
import re
import pandas as pd
import numpy as np
df = pd.read_csv('data/job_market.csv')
df = df.drop(['link', 'description', 'page_number'], axis=1)
%matplotlib inline
In [2]:
# Ten most frequent values of the `location` column (value_counts sorts descending).
df['location'].value_counts().iloc[:10]
Out[2]:
In [3]:
# Frequencies of the ten most common locations; the index holds the location names.
top_locations = df['location'].value_counts().head(10)
# Keep only the rows whose location is one of those ten.
df2 = df.loc[df['location'].isin(top_locations.index)]
grouped = df2.groupby('location')
# Per location: summed `applications` divided by the count of non-null `id` values.
grouped['applications'].sum().div(grouped['id'].count())
Out[3]:
In [4]:
# Mean of `salary_max` for each location in the top-location subset.
df2.groupby('location')['salary_max'].mean()
Out[4]:
In [5]:
# Rows with zero applications, restricted to the columns examined below.
# An explicit .copy() makes df3 an independent frame, so adding columns to it
# in later cells does not raise SettingWithCopyWarning or depend on whether
# pandas handed back a view of `df`.
df3 = (
    df.query("applications == 0")
    .loc[:, ['title', 'location', 'salary_min', 'salary_max', 'found', 'published']]
    .copy()
)
df3.head()
Out[5]:
In [6]:
# Summary statistics for df3; by default `describe` reports only numeric
# columns when the frame contains any.
df3.describe()
Out[6]:
In [7]:
# Difference between the `found` and `published` values, as a timedelta column.
# pd.to_datetime is used instead of astype(np.datetime64): the unit-less
# datetime64 dtype is rejected by pandas 2.x, while to_datetime is the
# supported parser. `assign` returns a new frame, so re-running the cell is
# idempotent and nothing is written into a possible slice of `df`.
df3 = df3.assign(
    daysOn=pd.to_datetime(df3['found']) - pd.to_datetime(df3['published'])
)
df3.head()
Out[7]:
In [8]:
# Whole-day component of `daysOn` as a number.
# The vectorised `.dt.days` accessor replaces the per-row lambda that went
# through np.timedelta64(...).astype(int); it is faster and yields NaN for a
# missing duration (NaT).
# NOTE(review): `.dt.days` floors toward negative infinity for negative
# durations — confirm that is acceptable if `found` can precede `published`.
df3 = df3.assign(daysOnInt=df3['daysOn'].dt.days)
df3['daysOnInt'].describe()
Out[8]:
In [9]:
# Ten most frequent locations among the zero-application rows in df3.
df3['location'].value_counts().iloc[:10]
Out[9]: