In [1]:
from setup import *
import sys
if DATA_PATH not in sys.path: sys.path.append(DATA_PATH)
from constants import *
%matplotlib inline
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_columns', 200)
In [3]:
import matplotlib
from IPython.display import display, HTML
%matplotlib inline
np = pd.np
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_columns', 250)
%pprint
In [6]:
# Load deduped_tweets.csv.gz from DATA_PATH, then keep only the last row seen
# for each `id_str` value.
tweets_path = os.path.join(DATA_PATH, 'deduped_tweets.csv.gz')
df = pd.read_csv(tweets_path, low_memory=False)
rawlen = len(df)
df = df.drop_duplicates(subset='id_str', keep='last')
# Cell output: number of duplicate rows that were dropped.
rawlen - len(df)
Out[6]:
In [7]:
# Count how many columns there are of each dtype.
# `DataFrame.get_dtype_counts()` was deprecated in pandas 0.25 and removed in
# 1.0; `dtypes.value_counts()` is the documented replacement.
df.dtypes.value_counts()
Out[7]:
In [8]:
# Per-column dtypes, indexed by column name. `DataFrame.dtypes` already is
# exactly this Series, so there is no need to extract every column just to
# read its dtype.
dtypes = df.dtypes
dtypes
Out[8]:
In [9]:
# dtypes kept as "numbers": bool, float (compares equal to float64) and int64.
# Don't forget bool, and note it must be int64 rather than the builtin `int`.
numeric_dtypes = (bool, float, np.dtype('int64'))
# Despite its name, `mask` is a list of column names, not a boolean mask.
mask = [name for name, kind in df.dtypes.items() if kind in numeric_dtypes]
numbers = df[mask]
numbers
Out[9]:
In [8]:
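# Disabled one-off export of `numbers` to numbers.csv.gz (the file read back below).
# NOTE(review): `f` is never used -- `to_csv` is given the path, not the open gzip handle.
# import gzip
# with gzip.open(os.path.join(DATA_PATH, 'numbers.csv.gz'), 'wb') as f:
#     numbers.to_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), encoding='utf-8')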
In [1]:
# Reload `numbers` from the gzip-compressed CSV in DATA_PATH, using the
# pure-Python parser.
numbers_path = os.path.join(DATA_PATH, 'numbers.csv.gz')
numbers = pd.read_csv(numbers_path, compression='gzip', engine='python')
In [7]:
# Show the column names that contain the substring "follow".
list(filter(lambda name: 'follow' in name, numbers.columns))
Out[7]:
In [11]:
# Swap spaces for underscores in every column name so that columns can be
# reached with attribute access (e.g. `numbers.some_column`).
underscored = []
for name in numbers.columns:
    underscored.append(name.replace(' ', '_'))
numbers.columns = underscored
In [12]:
# Keep the names of the columns that contain the substring "follow".
cols = list(filter(lambda name: 'follow' in name, numbers.columns))
Out[12]:
In [19]:
# Histogram of the `user_followers_count` column; each row of `numbers` adds
# one count (presumably one row per tweet, hence the "Tweets" label).
ax = numbers.user_followers_count.hist()
# Log-scale the counts, clipping non-positive values. The original keyword
# `noposy` was a misspelling of `nonposy`, which matplotlib 3.3 renamed to
# `nonpositive` (the old name was removed in 3.5).
ax.set_yscale('log', nonpositive='clip')
ax.set_ylabel('Tweets')
ax.set_xlabel('Followers')
Out[19]:
In [20]:
# TODO: group by user ID before plotting per-user stats such as follower counts;
# otherwise a user is counted once for every row (tweet) they appear in.
In [ ]: