In [13]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
In [14]:
# preprocessing functions

# shared character vocabulary: Cyrillic letters (both cases), common punctuation,
# and Latin letters (both cases); each symbol appears exactly once
vocabulary = ['а','б','в','г','д','е','ё','ж','з','и','й','к','л','м','н','о','п','р','с','т','у','ф','х','ц','ч','ш','щ','ъ','ы','ь','э','ю','я',
              'А','Б','В','Г','Д','Е','Ё','Ж','З','И','Й','К','Л','М','Н','О','П','Р','С','Т','У','Ф','Х','Ц','Ч','Ш','Щ','Ъ','Ы','Ь','Э','Ю','Я',
              '!','@','#','$','%','^','&','*','(',')',':',';','/',',','.','№','?','~','-','+','=',' ',
              'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
              'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']

num_of_lett = 300  # every text is padded or truncated to this many characters

def one_hot(sentence):
    """Encode a string as a (num_of_lett, len(vocabulary)) one-hot matrix;
    characters outside the vocabulary are skipped."""
    rows = []
    for char in sentence:
        if char in vocabulary:
            vector = np.zeros((1, len(vocabulary)))
            vector[0, vocabulary.index(char)] = 1
            rows.append(vector)
    result = np.concatenate(rows) if rows else np.zeros((0, len(vocabulary)))
    # pad with zero rows or truncate to the fixed length
    if len(result) < num_of_lett:
        result = np.concatenate((result, np.zeros((num_of_lett - result.shape[0], result.shape[1]))))
    if len(result) > num_of_lett:
        result = result[:num_of_lett, :]
    return result

def char_to_vocab(sentence):
    """Encode a string as a fixed-length list of 1-based vocabulary indices
    (0 is reserved for padding); characters outside the vocabulary are skipped."""
    result = []
    for char in sentence:
        if char in vocabulary:
            result.append(vocabulary.index(char) + 1)
    result = np.array(result)
    # pad with zeros or truncate to the fixed length
    if len(result) < num_of_lett:
        result = np.concatenate((result, np.zeros(num_of_lett - result.shape[0])))
    if len(result) > num_of_lett:
        result = result[:num_of_lett]
    return list(result)
#char_to_vocab(X_train[1])
#X_train[0]
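A quick smoke test of the two encoders (the sample string is made up; the shapes follow from num_of_lett = 300 and the 140-character vocabulary above):

sample = 'Привет, мир!'            # hypothetical caption
print(one_hot(sample).shape)       # (300, 140): padded length x vocabulary size
print(char_to_vocab(sample)[:12])  # 1-based indices of the 12 kept characters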
In [11]:
data = pd.read_csv('Dataset/dataset_raw.csv')
plt.hist(data.likes, bins=150)
plt.show()
print(data.likes.mean())
data[data.likes < 10000]  # inspect the posts below the long tail of the likes distribution
Out[11]:
In [56]:
data['likes'] = data.likes / data.likes.max()  # normalise likes to [0, 1]
pic_names = os.listdir('Dataset/images/')
# random ~80/10/10 train/val/test split over the image filenames
msk = np.random.rand(len(pic_names)) < 0.8
train_names = np.array(pic_names)[msk]
val_names = np.array(pic_names)[~msk]
msk = np.random.rand(len(val_names)) < 0.5
test_names = val_names[msk]
val_names = val_names[~msk]
print(train_names.shape)
print(val_names.shape)
print(test_names.shape)
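np.random.rand gives a fresh split on every run; a minimal sketch of the same ~80/10/10 split made reproducible with a seeded generator (the seed value is arbitrary):

rng = np.random.RandomState(42)   # fixed seed -> identical split across runs
msk = rng.rand(len(pic_names)) < 0.8
train_names = np.array(pic_names)[msk]
rest = np.array(pic_names)[~msk]
msk = rng.rand(len(rest)) < 0.5   # halve the remainder into test/val
test_names, val_names = rest[msk], rest[~msk]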
In [57]:
#data_lett = pd.read_csv('Dataset/dataset_raw.csv')
X = data.texts.values
#np.isnan(np.nan)
# build (id, text, likes) triples per split, keeping only rows whose text
# is a string (missing captions come back from pandas as NaN floats)
X_test = []
y_test = []
id_test = []
for name in test_names:
    cache = data.iloc[int(name[:-4])]  # '1234.jpg' -> row 1234
    if isinstance(cache.texts, str):
        id_test.append(name)
        X_test.append(cache.texts)
        y_test.append(cache.likes)
X_val = []
y_val = []
id_val = []
for name in val_names:
    cache = data.iloc[int(name[:-4])]
    if isinstance(cache.texts, str):
        id_val.append(name)
        X_val.append(cache.texts)
        y_val.append(cache.likes)
X_train = []
y_train = []
id_train = []
for name in train_names:
    cache = data.iloc[int(name[:-4])]
    if isinstance(cache.texts, str):
        id_train.append(name)
        X_train.append(cache.texts)
        y_train.append(cache.likes)
print('train dataset: ' + str(len(id_train)))
print('val dataset: ' + str(len(id_val)))
print('test dataset: ' + str(len(id_test)))
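The three loops above differ only in which name array they consume; a small helper (hypothetical name collect_split, assuming the same '<row index>.jpg' filename convention) would keep the logic in one place:

def collect_split(names, frame):
    """Return (ids, texts, likes) lists, skipping rows with NaN captions."""
    ids, texts, likes = [], [], []
    for name in names:
        row = frame.iloc[int(name[:-4])]
        if isinstance(row.texts, str):
            ids.append(name)
            texts.append(row.texts)
            likes.append(row.likes)
    return ids, texts, likes

# id_train, X_train, y_train = collect_split(train_names, data)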
In [ ]:
# distribution of caption lengths, to sanity-check the fixed cut-off
# num_of_lett = 300 used by the encoders (NaN rows are skipped)
len_X = [len(t) for t in X if isinstance(t, str)]
plt.hist(len_X, bins=100)
plt.show()
In [58]:
# character-encode each split (lists of fixed-length index vectors)
X_train_encoded = []
for sentence in tqdm(X_train):
    X_train_encoded.append(char_to_vocab(sentence))
X_val_encoded = []
for sentence in tqdm(X_val):
    X_val_encoded.append(char_to_vocab(sentence))
X_test_encoded = []
for sentence in tqdm(X_test):
    X_test_encoded.append(char_to_vocab(sentence))
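The encoded splits are plain Python lists of equal-length vectors, so they stack straight into arrays for a model; a sketch (the array names are new here, and the int64 dtype assumes a downstream embedding layer that treats index 0 as padding):

X_train_arr = np.array(X_train_encoded, dtype=np.int64)  # shape (n_train, 300)
X_val_arr = np.array(X_val_encoded, dtype=np.int64)
X_test_arr = np.array(X_test_encoded, dtype=np.int64)
y_train_arr = np.array(y_train, dtype=np.float32)        # normalised like counts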
In [47]:
X_val_encoded  # eyeball a few encoded validation captions
Out[47]:
In [59]:
# persist each split as a DataFrame pickle
data_test = {'id': id_test, 'texts': X_test, 'likes': y_test}
dataset_test = pd.DataFrame(data_test)
with open('Dataset/dataset_test.pkl', 'wb') as f:
    pickle.dump(dataset_test, f)
#dataset_test.to_csv('Dataset/dataset_test.csv', encoding='utf-8')
data_val = {'id': id_val, 'texts': X_val, 'likes': y_val}
dataset_val = pd.DataFrame(data_val)
with open('Dataset/dataset_val.pkl', 'wb') as f:
    pickle.dump(dataset_val, f)
#dataset_val.to_csv('Dataset/dataset_val.csv', encoding='utf-8')
data_train = {'id': id_train, 'texts': X_train, 'likes': y_train}
dataset_train = pd.DataFrame(data_train)
with open('Dataset/dataset_train.pkl', 'wb') as f:
    pickle.dump(dataset_train, f)
#dataset_train.to_csv('Dataset/dataset_train.csv', encoding='utf-8')
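The pickles round-trip back into DataFrames in a later session; a minimal sketch:

with open('Dataset/dataset_train.pkl', 'rb') as f:
    dataset_train = pickle.load(f)
# equivalent one-liner: dataset_train = pd.read_pickle('Dataset/dataset_train.pkl')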
In [52]:
dataset_train.texts.values[0]
Out[52]: