In [1]:
# Notebook bootstrap. NOTE(review): `setup` is star-imported and presumably
# provides pd, np, plt, os, gzip, display/HTML and DATA_PATH — confirm.
from setup import *
import sys
# Make the data directory importable so `constants` below can be found.
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Widen the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Compact row display, but show many columns for this wide tweet frame.
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 200)
In [3]:
# Load the raw tweet dump and drop duplicate tweets (same id_str),
# keeping the most recent occurrence of each.
df = pd.read_csv(os.path.join(DATA_PATH, 'all_tweets.csv.gz'))
rawlen = len(df)
df = df.drop_duplicates('id_str', keep='last')
# Number of duplicate rows removed (cell output).
rawlen - len(df)
Out[3]:
In [5]:
# Read the de-duplicated tweets back in, decompressing the gzip explicitly.
deduped_path = os.path.join(DATA_PATH, 'deduped_tweets.csv.gz')
with gzip.open(deduped_path, 'rb') as f:
    df = pd.read_csv(f, encoding='utf8', low_memory=False)
In [6]:
# Which column names mention a date? (`in` already yields a bool.)
mask = np.array(['date' in c for c in df.columns])
df.columns[mask]
Out[6]:
In [7]:
df.columns[np.array([bool('time' in c) for c in df.columns])]
Out[7]:
Do you remember the date and time columns from Dan's tutorial?
In [8]:
# Twitter timestamp columns follow the *_at naming convention.
mask = np.array(df.columns.str.endswith('_at'))
df.columns[mask]
Out[8]:
In [9]:
# Slice out just the timestamp columns for inspection.
at_columns = df.columns[mask]
dates = df[at_columns]
dates
Out[9]:
In [10]:
# Re-parse each *_at column into proper datetime64 values
# (the CSV loaded them as plain strings).
dates = pd.DataFrame(index=df.index)
timestamp_cols = [c for c in df.columns if c.endswith('_at')]
for col in timestamp_cols:
    print(col)
    dates[col] = pd.to_datetime(df[col])
In [11]:
dates
Out[11]:
In [38]:
# One-time cache step (already executed, hence commented out): persist the
# parsed datetimes so later sessions can skip the slow pd.to_datetime loop.
# import gzip
# with gzip.open(os.path.join(DATA_PATH, 'datetimes.csv.gz'), 'wb') as f:
# dates.to_csv(f)
# # dates.to_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), compression='gzip')
In [39]:
%ls -thal DATA_PATH
!ls -thal data
In [40]:
system("ls %s" % DATA_PATH)
Out[40]:
In [41]:
ls -thal ../../data
In [12]:
# Reload the cached datetimes. FIX: the cache was written with its index
# (dates.to_csv above), so index_col=0 is needed — without it the old index
# comes back as a spurious "Unnamed: 0" column that the loop below would
# mis-parse as a datetime.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'),
                    index_col=0, engine='python')
for col in dates.columns:
    print(col)
    # CSV round-trips lose dtypes, so re-parse each column to datetime64.
    dates[col] = pd.to_datetime(dates[col])
In [13]:
dates.created_at.dt.hour
Out[13]:
In [14]:
dates.created_at.dt.weekday
Out[14]:
In [15]:
# Day-of-week distribution of tweets; one histogram bin per weekday.
dow = pd.Series(dates.created_at.dt.weekday)
dow.hist(bins=list(range(8)))
plt.xlabel('0=Mon 6=Sun')
plt.ylabel('Tweets')
In [18]:
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
In [19]:
features = pd.DataFrame({'faves': nums.favorite_count, 'weekday': dow})
Out[19]:
In [24]:
# Map numeric weekday codes (0-6) to readable labels; anything unmapped
# stays '' as in the original element-wise assignment.
weekday_labels = 'Mon Tues Wed Thurs Fri Sat Sun'.split()
names = dow.map(dict(enumerate(weekday_labels))).fillna('')
In [33]:
# Count tweets with at least one favorite, grouped by weekday label.
faved_names = names[nums.favorite_count > 0]
faves_by_dow = pd.Series(Counter(faved_names))
faves_by_dow
Out[33]:
In [34]:
# Fraction of tweets that got at least one favorite, per weekday.
faved_counts = pd.Series(Counter(names[nums.favorite_count > 0]))
total_counts = pd.Series(Counter(names))
fave_ratio_by_dow = faved_counts / total_counts
fave_ratio_by_dow
Out[34]:
Keep in mind that these aren't big differences,
and we also have a sampling bias ("Python" tweets, collected in May).
Now it's your turn!
Can you do a similar analysis for time of day?
Hint: think about a good "bin" size.
Use tab-completion on the `.dt` accessor of the datetime Series you are interested in.