In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
# Load the parsed survey responses, using the 'id' column as the index.
poll_data = pd.read_csv(
    '../data/mlcourse_open_first_survey_parsed.csv',
    index_col='id',
)
In [3]:
poll_data.head(2)
Out[3]:
In [4]:
poll_data.info()
In [5]:
poll_data.shape
Out[5]:
Переименуем признаки.
In [6]:
# Short English aliases for the verbose survey-question column headers,
# in the same order as poll_data.columns.
short_names = ['time', 'intro', 'jupyter', 'expect',
               'sport', 'personal', 'math_level', 'quest1',
               'quest2', 'quest3', 'fib_gen', 'python',
               'github', 'movie']
rename_dic = dict(zip(poll_data.columns, short_names))
In [7]:
poll_data.rename(columns=rename_dic, inplace=True)
In [8]:
poll_data.head(2)
Out[8]:
In [9]:
poll_data['jupyter'].value_counts().head()
Out[9]:
In [10]:
# Encode the Jupyter answer numerically: yes -> 1, no -> 0, and everything
# else (free-form or missing answers) -> 2 via fillna.
yes_no_codes = {'Да': 1,
                'Нет': 0}
jupyer_levels = poll_data['jupyter'].map(yes_no_codes).fillna(2)
In [11]:
jupyer_levels.head()
Out[11]:
In [12]:
sns.countplot(jupyer_levels);
In [13]:
poll_data.columns
Out[13]:
In [14]:
poll_data.sport.value_counts().head()
Out[14]:
Доля ответивших «2» на 3-й вопрос (среди всех, кто на него ответил)
In [15]:
(poll_data['quest3'] == 2).dropna().sum() / poll_data['quest3'][~poll_data['quest3'].isnull()].shape[0]
Out[15]:
Ищем топ-10 слов в представлении себя, предварительно убрав стоп-слова.
In [16]:
# Flatten every self-introduction into a single flat array of words.
# .dropna() guards against missing answers: calling .strip() on NaN
# would raise AttributeError; with no NaNs the result is unchanged.
all_words = np.concatenate(
    poll_data['intro'].dropna().apply(lambda s: s.strip().split()).values)
In [17]:
len(all_words)
Out[17]:
In [18]:
import nltk
In [19]:
nltk.download('stopwords')
Out[19]:
In [20]:
nltk.corpus.stopwords.words('russian')[:10]
Out[20]:
In [21]:
# Top-10 most frequent words in the self-introductions, stop words excluded.
# Build the stop-word set once (O(1) membership for isin) instead of
# re-creating the list inside the indexing expression.
russian_stopwords = set(nltk.corpus.stopwords.words('russian'))
words_series = pd.Series(list(all_words))
words_series[~words_series.isin(russian_stopwords)].value_counts().head(10)
Out[21]: