In [17]:
import sys
import warnings
from tqdm import tqdm
import csv
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import create_engine, Table, Column, Integer, String, Boolean, DateTime, Date, MetaData, ForeignKey
from sqlalchemy.engine.url import URL
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import NullPool
local_name = 'sqlite:///../corpus/stories.sqlite'
engine = create_engine(local_name, echo=True)
%matplotlib inline
In [118]:
# Read every row of the `stories` table into a DataFrame through the engine.
df = pd.read_sql_query(sql='select * from stories', con=engine)
In [119]:
# Print the (rows, columns) size, then show the first five rows as the cell output.
stories_shape = df.shape
print(stories_shape)
df.head(n=5)
Out[119]:
In [234]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
In [235]:
# Narrow the frame to the `tags` and `likes` columns used by the tag-based model.
df_tags_likes = df.loc[:, ['tags', 'likes']]
print(df_tags_likes.shape)
# Preview the first ten rows.
df_tags_likes.head(10)
Out[235]:
In [248]:
from sklearn.preprocessing import Binarizer
from sklearn.feature_extraction.text import CountVectorizer
# Encode the `tags` column as token counts (one feature column per token).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_tags_likes['tags'])
# Regression target: the `likes` column as a plain array.
y = df_tags_likes['likes'].values
In [249]:
# Display the feature matrix and the target together as the cell output.
(X, y)
Out[249]:
In [252]:
# Hold out 20% of the rows for evaluation. A fixed random_state pins the shuffle,
# so the split -- and every score computed from it -- is identical on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
In [253]:
# Fit a linear regression on the training split.
lm = linear_model.LinearRegression()
model = lm.fit(X=X_train, y=y_train)
In [254]:
# Evaluate the fitted model on the held-out split (R^2 for scikit-learn regressors).
lm.score(X=X_test, y=y_test)
Out[254]:
In [258]:
# Predict likes from bag-of-words counts of the story text.
df_text_likes = df[['text', 'likes']]
print(df_text_likes.shape)

# Token-count features for the text; the target is the `likes` column.
vectorizer = CountVectorizer()
X, y = vectorizer.fit_transform(df_text_likes['text']), df_text_likes['likes']

# Hold out 20% for evaluation. A fixed random_state pins the shuffle so the
# reported score is reproducible instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
# Score on the held-out split (R^2 for scikit-learn regressors).
lm.score(X_test, y_test)
Out[258]:
In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Predict likes from TF-IDF features of the story text.
df_text_likes = df[['text', 'likes']]
print(df_text_likes.shape)

# Split the raw text BEFORE vectorizing. TF-IDF learns its vocabulary and IDF
# weights from whatever it is fitted on, so fitting on the full corpus would let
# statistics from the held-out rows leak into the training features.
# A fixed random_state keeps the split (and the score) reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    df_text_likes['text'],
    df_text_likes['likes'],
    test_size=0.2,
    random_state=42,
)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)  # learn vocabulary + IDF on train only
X_test = vectorizer.transform(text_test)        # reuse the fitted vocabulary + IDF
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
# Score on the held-out split (R^2 for scikit-learn regressors).
lm.score(X_test, y_test)
Out[261]:
In [ ]: