In [1]:
import sys
import warnings
from tqdm import tqdm
import csv
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Column layout of the header-less, tab-separated export.
names = ['id', 'date', 'tags', 'likes']
# The dtype map must be keyed by the names given above; the previous 'col3'
# key matched no column, so the tags column never received its declared dtype.
# 'date' is read as text first and then converted by parse_dates.
dtypes = {'id': 'int', 'date': 'str', 'tags': 'str', 'likes': 'int'}
df = pd.read_csv('data.csv', header=None, sep='\t', names=names, dtype=dtypes, parse_dates=['date'])
In [3]:
# Preview the first rows of the loaded frame (columns: id, date, tags, likes).
df.head()
Out[3]:
In [4]:
# Likes plotted against the parsed date column.
df.plot(x='date', y='likes')
Out[4]:
In [5]:
# Every distinct tag that occurs anywhere in the whitespace-separated 'tags'
# column. The set is sorted so that the order (and everything derived from it:
# tie-breaking in frequency rankings, series order in plots) is identical on
# every kernel restart instead of depending on string hash randomisation.
all_tags = sorted({tag for tags in df['tags'] for tag in tags.split()})
# Layout of the indicator table: three metadata columns, then one column per tag.
columns = ['id', 'date', 'likes'] + all_tags
In [6]:
# Empty frame whose columns are id, date, likes followed by one column per tag.
df_tags = pd.DataFrame(columns=columns)
In [7]:
# Build the per-row tag indicator table: metadata (id, date, likes) followed by
# a 0/1 flag for every known tag.
#
# Rows are collected in a plain list and the DataFrame is constructed once at
# the end. Assigning rows one at a time with .loc forces pandas to reallocate
# the frame on every iteration and leaves the columns with object dtype.
column_position = {name: position for position, name in enumerate(columns)}
indicator_rows = []
# total= lets tqdm render a real progress bar; iterrows() has no length.
for index, row in tqdm(df.iterrows(), total=len(df)):
    record = [row['id'], row['date'], row['likes']] + [0] * len(all_tags)
    for tag in row['tags'].split():
        record[column_position[tag]] = 1
    indicator_rows.append(record)
# Reuse df's index so rows of df_tags line up with rows of df.
df_tags = pd.DataFrame(indicator_rows, columns=columns, index=df.index)
In [38]:
# Preview the tag indicator table (id, date, likes, then one 0/1 column per tag).
df_tags.head()
Out[38]:
In [49]:
# Pair every tag with the number of rows that carry it, then rank the pairs
# from least to most frequent (stable sort, so ties keep all_tags order).
tags_sorted = sorted(
    [(name, df_tags[name].sum()) for name in all_tags],
    key=lambda pair: pair[1],
)
# The 15 most frequent tags sit at the tail of the ascending ranking.
most_frequent_pairs = tags_sorted[-15:]
print(most_frequent_pairs)
tags_frequent = [pair[0] for pair in most_frequent_pairs]
In [26]:
# Raw 0/1 indicator of every tag over time, drawn as dots; the legend is
# switched off because there is one series per tag.
df_tags.plot(x='date', y=all_tags, style='o', legend=False)
Out[26]:
In [60]:
def plotMovingAverage(series, n, max_legend_entries=20):
    """
    Plot the moving average of one or more time series as dots.

    series - pandas Series or DataFrame with the time series (one per column)
    n - rolling window size, in rows
    max_legend_entries - a legend is drawn only when the number of plotted
        series does not exceed this value, so plots with one series per tag
        are not buried under the legend

    The first n - 1 points of every series are NaN (incomplete window) and are
    therefore not drawn. Returns None; the figure is left as the current
    matplotlib figure.
    """
    # The mean, not the sum: the function name, the plot title and the
    # confidence-band formula below all refer to an average.
    rolling_mean = series.rolling(window=n).mean()

    # Treat a single Series as a one-column frame so both input kinds share
    # the same plotting loop.
    if isinstance(rolling_mean, pd.Series):
        rolling_mean = rolling_mean.to_frame()

    plt.figure(figsize=(15, 5))
    plt.title("Moving average\n window size = {}".format(n))

    # Optionally, confidence intervals for the smoothed values could be drawn
    # here as rolling_mean +/- 1.96 * series.rolling(window=n).std().

    # Plot column by column so each series carries a label; calling
    # plt.legend() without labelled artists only yields a warning and an
    # empty legend.
    for name, values in rolling_mean.items():
        plt.plot(rolling_mean.index, values, 'o', label=str(name))

    if len(rolling_mean.columns) <= max_legend_entries:
        plt.legend(loc="upper left")
    plt.grid(True)
In [52]:
# All tag indicator columns, smoothed over a 30-row rolling window.
plotMovingAverage(df_tags[all_tags], 30)
In [51]:
# All tag indicator columns, smoothed over a 90-row rolling window.
plotMovingAverage(df_tags[all_tags], 90)
In [37]:
# All tag indicator columns, smoothed over a 365-row rolling window.
plotMovingAverage(df_tags[all_tags], 365)
In [57]:
# Only the most frequent tags (tags_frequent), 30-row rolling window.
plotMovingAverage(df_tags[tags_frequent], 30)
In [58]:
# Only the most frequent tags (tags_frequent), 90-row rolling window.
plotMovingAverage(df_tags[tags_frequent], 90)
In [59]:
# Only the most frequent tags (tags_frequent), 365-row rolling window.
plotMovingAverage(df_tags[tags_frequent], 365)
In [15]:
# Calendar month (1-12) of every row. Datetime components of a Series are only
# reachable through the .dt accessor (a bare `.month` raises AttributeError);
# pd.to_datetime makes this work even if the column is stored with object dtype.
df_tags['mon'] = pd.to_datetime(df_tags['date']).dt.month
# Per-month column totals. The date column is excluded because timestamps
# cannot be summed; every other column is summed within its month group.
df_tags_monsum = df_tags.drop(columns=['date']).groupby('mon').sum()
In [ ]:
In [53]:
# Likes of the first 5000 rows, smoothed over a 7-row rolling window.
plotMovingAverage(df['likes'][:5000], 7)
In [55]:
# Likes of the first 500 rows, smoothed over a 30-row rolling window.
plotMovingAverage(df['likes'][:500], 30)
In [49]:
# Likes of all rows, smoothed over a 90-row rolling window.
plotMovingAverage(df['likes'], 90)
In [50]:
# Likes of all rows, smoothed over a 365-row rolling window.
plotMovingAverage(df['likes'], 365)
In [ ]:
def weighted_average(series, weights):
    """
    Weighted sum of the last len(weights) observations of a series.

    series - sequence of numbers (list, tuple, numpy array or pandas Series)
    weights - sequence of weights in chronological order: weights[-1] is
        applied to the most recent observation series[-1], weights[-2] to
        series[-2], and so on

    Returns the weighted sum as a float (0.0 for empty weights). The weights
    are not normalised here; pass weights that sum to 1 for a true average.
    Raises IndexError when there are more weights than observations.
    """
    # pandas Series subscripting is label-based for integer indexes, so -1
    # would be looked up as a label; .iloc gives positional access instead.
    observations = series.iloc if hasattr(series, "iloc") else series

    result = 0.0
    # Walk the weights from last to first without reversing the caller's list
    # in place; an in-place reverse would flip the weights on every call.
    for offset, weight in enumerate(reversed(weights), start=1):
        result += observations[-offset] * weight
    return result