In [1]:
# Notebook bootstrap. NOTE(review): `setup` is star-imported and presumably
# provides pd, np, plt, os, gzip, display/HTML and DATA_PATH — confirm.
from setup import *
import sys
# Make the data directory importable so `constants` below can be found.
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Widen the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Compact row display, but show many columns for this wide tweet frame.
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 200)
In [3]:
# Load the raw tweet dump and drop duplicate tweets (same id_str),
# keeping the most recent occurrence of each.
df = pd.read_csv(os.path.join(DATA_PATH, 'all_tweets.csv.gz'))
rawlen = len(df)
df = df.drop_duplicates('id_str', keep='last')
# Number of duplicate rows removed (cell output).
rawlen - len(df)
Out[3]:
In [5]:
# Read the de-duplicated tweets back in, decompressing the gzip explicitly.
deduped_path = os.path.join(DATA_PATH, 'deduped_tweets.csv.gz')
with gzip.open(deduped_path, 'rb') as f:
    df = pd.read_csv(f, encoding='utf8', low_memory=False)
In [6]:
# Which column names mention a date? (`in` already yields a bool.)
mask = np.array(['date' in c for c in df.columns])
df.columns[mask]
Out[6]:
In [7]:
df.columns[np.array([bool('time' in c) for c in df.columns])]
Out[7]:
Do you remember the date and time columns from Dan's tutorial?
In [8]:
# Twitter timestamp columns follow the *_at naming convention.
mask = np.array(df.columns.str.endswith('_at'))
df.columns[mask]
Out[8]:
In [9]:
# Slice out just the timestamp columns for inspection.
at_columns = df.columns[mask]
dates = df[at_columns]
dates
Out[9]:
In [10]:
# Re-parse each *_at column into proper datetime64 values
# (the CSV loaded them as plain strings).
dates = pd.DataFrame(index=df.index)
timestamp_cols = [c for c in df.columns if c.endswith('_at')]
for col in timestamp_cols:
    print(col)
    dates[col] = pd.to_datetime(df[col])
In [11]:
dates
Out[11]:
In [38]:
# One-time cache step (already executed, hence commented out): persist the
# parsed datetimes so later sessions can skip the slow pd.to_datetime loop.
# import gzip
# with gzip.open(os.path.join(DATA_PATH, 'datetimes.csv.gz'), 'wb') as f:
# dates.to_csv(f)
# # dates.to_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), compression='gzip')
In [39]:
%ls -thal DATA_PATH
!ls -thal data
In [40]:
system("ls %s" % DATA_PATH)
Out[40]:
In [41]:
ls -thal ../../data
In [12]:
# Reload the cached datetimes. FIX: the cache was written with its index
# (dates.to_csv above), so index_col=0 is needed — without it the old index
# comes back as a spurious "Unnamed: 0" column that the loop below would
# mis-parse as a datetime.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'),
                    index_col=0, engine='python')
for col in dates.columns:
    print(col)
    # CSV round-trips lose dtypes, so re-parse each column to datetime64.
    dates[col] = pd.to_datetime(dates[col])
In [13]:
dates.created_at.dt.hour
Out[13]:
In [14]:
dates.created_at.dt.weekday
Out[14]:
In [15]:
# Day-of-week distribution of tweets; one histogram bin per weekday.
dow = pd.Series(dates.created_at.dt.weekday)
dow.hist(bins=list(range(8)))
plt.xlabel('0=Mon 6=Sun')
plt.ylabel('Tweets')
In [18]:
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
In [19]:
features = pd.DataFrame({'faves': nums.favorite_count, 'weekday': dow})
Out[19]:
In [24]:
# Map numeric weekday codes (0-6) to readable labels; anything unmapped
# stays '' as in the original element-wise assignment.
weekday_labels = 'Mon Tues Wed Thurs Fri Sat Sun'.split()
names = dow.map(dict(enumerate(weekday_labels))).fillna('')
In [33]:
# Count tweets with at least one favorite, grouped by weekday label.
faved_names = names[nums.favorite_count > 0]
faves_by_dow = pd.Series(Counter(faved_names))
faves_by_dow
Out[33]:
In [34]:
# Fraction of tweets that got at least one favorite, per weekday.
faved_counts = pd.Series(Counter(names[nums.favorite_count > 0]))
total_counts = pd.Series(Counter(names))
fave_ratio_by_dow = faved_counts / total_counts
fave_ratio_by_dow
Out[34]:
Keep in mind that these aren't big differences,
and we also have a sampling bias ("Python" tweets, collected in May).
Now it's your turn!
Can you do a similar analysis for time of day?
Hint: think about a good "bin" size.
Use tab-completion on the `.dt` accessor of the datetime Series you are interested in.