In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import join
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
def process_csv(file_path, word_counts=None, norm_counts=None):
    """Add the token and norm occurrences of one CSV file to running tallies.

    The file is read with pandas as UTF-8 and must provide the columns
    ``token`` and ``norm``. Every value in those columns increases its
    entry in the matching dictionary by one, so after the call each entry
    holds the true number of occurrences seen so far.

    Parameters:
        file_path: path of the CSV file to read.
        word_counts: dict updated in place with token tallies. When omitted,
            the module-level ``words_dict`` is used.
        norm_counts: dict updated in place with norm tallies. When omitted,
            the module-level ``norms_dict`` is used.

    Returns:
        None. The dictionaries are modified in place.
    """
    if word_counts is None:
        word_counts = words_dict
    if norm_counts is None:
        norm_counts = norms_dict

    data = pd.read_csv(file_path, encoding='utf-8')

    # NOTE(review): pandas reads empty cells as NaN and they are tallied as
    # keys like any other value - confirm whether they should be skipped.
    for word in data.token:
        # Start from 0 for unseen keys so the first occurrence counts as 1.
        word_counts[word] = word_counts.get(word, 0) + 1
    for norm in data.norm:
        norm_counts[norm] = norm_counts.get(norm, 0) + 1
In [2]:
# Root folder of the first corpus: it is scanned as a set of sub-folders
# whose files are each passed to process_csv.
markup_path = './markup'
In [3]:
# Empty tallies that process_csv fills in: one keyed by token, one keyed by norm.
words_dict, norms_dict = {}, {}
In [5]:
for folder in listdir(markup_path):
print folder,
folder_path = join(markup_path, folder)
for data_file in listdir(folder_path):
file_path = join(folder_path, data_file)
process_csv(file_path)
In [6]:
# (norm, tally) pairs ordered from the largest tally to the smallest.
sorted_norms = list(norms_dict.items())
sorted_norms.sort(key=lambda pair: pair[1], reverse=True)
In [14]:
for norm in sorted_norms[:100]:
print norm[0],'(',norm[1],') ',
In [31]:
# Frequency-vs-rank scatter for the first 100 entries of sorted_norms.
x = [norm[1] for norm in sorted_norms[:100]]
# Ranks start at 1 so the horizontal axis agrees with its "rank" label.
ranks = range(1, len(x) + 1)
plt.figure(figsize=(16,8))
plt.xlabel(u'Ранг')
plt.ylabel(u'Частота')
# Title typo fixed: "частотных" instead of "счастотных".
plt.title(u'100 самых частотных лексем')
plt.plot(ranks, x, 'ro')
plt.show()
In [21]:
# (token, tally) pairs ordered from the largest tally to the smallest.
sorted_words = list(words_dict.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)
In [22]:
for word in sorted_words[:100]:
print word[0],'(',word[1],') ',
In [32]:
# Frequency-vs-rank scatter for the first 100 entries of sorted_words.
x = [word[1] for word in sorted_words[:100]]
# Ranks start at 1 so the horizontal axis agrees with its "rank" label.
ranks = range(1, len(x) + 1)
plt.figure(figsize=(16,8))
plt.xlabel(u'Ранг')
plt.ylabel(u'Частота')
# Title typo fixed: "частотных" instead of "счастотных".
plt.title(u'100 самых частотных словоформ')
plt.plot(ranks, x, 'ro')
plt.show()
In [33]:
# Folder of the second corpus: its files are passed to process_csv directly,
# without walking any sub-folders.
war_path = './war'
In [34]:
# Start from empty tallies again so the second corpus is not mixed with
# whatever process_csv accumulated before.
words_dict = dict()
norms_dict = dict()
In [35]:
for data_file in listdir(war_path):
print data_file,
file_path = join(war_path, data_file)
process_csv(file_path)
In [36]:
# Order both tallies from the largest value to the smallest.
sorted_norms = list(norms_dict.items())
sorted_norms.sort(key=lambda pair: pair[1], reverse=True)
sorted_words = list(words_dict.items())
sorted_words.sort(key=lambda pair: pair[1], reverse=True)
In [37]:
for norm in sorted_norms[:100]:
print norm[0],'(',norm[1],') ',
In [38]:
# Frequency-vs-rank scatter for the first 100 entries of sorted_norms.
x = [norm[1] for norm in sorted_norms[:100]]
# Ranks start at 1 so the horizontal axis agrees with its "rank" label.
ranks = range(1, len(x) + 1)
plt.figure(figsize=(16,8))
plt.xlabel(u'Ранг')
plt.ylabel(u'Частота')
# Title typo fixed: "частотных" instead of "счастотных".
plt.title(u'100 самых частотных лексем')
plt.plot(ranks, x, 'ro')
plt.show()
In [39]:
for word in sorted_words[:100]:
print word[0],'(',word[1],') ',
In [40]:
# Frequency-vs-rank scatter for the first 100 entries of sorted_words.
x = [word[1] for word in sorted_words[:100]]
# Ranks start at 1 so the horizontal axis agrees with its "rank" label.
ranks = range(1, len(x) + 1)
plt.figure(figsize=(16,8))
plt.xlabel(u'Ранг')
plt.ylabel(u'Частота')
# Title typo fixed: "частотных" instead of "счастотных".
plt.title(u'100 самых частотных словоформ')
plt.plot(ranks, x, 'ro')
plt.show()