In [1]:
from planet4 import io
from planet4.region_data import Inca
Read the fast-read HDF5 file of Planet Four classifications.
In [2]:
uncleaned = "/Users/klay6683/local_data/2016-11-21_planet_four_classifications_queryable.h5"
In [4]:
data = pd.read_hdf(uncleaned, 'df')
In [5]:
c_id = '50ef419195e6e40eac000001'
In [6]:
sub = data[data.classification_id==c_id]
sub.shape
Out[6]:
In [7]:
sub
Out[7]:
In [8]:
c_id = '50ef41ea95e6e42e89000001'
sub2 = data[data.classification_id==c_id]
sub2.shape
Out[8]:
In [9]:
sub2
Out[9]:
In [10]:
image_ids = data.image_id.unique()
In [11]:
df = data[data.image_id==image_ids[0]]
In [12]:
df = data[data.image_id=='APF00004y6']
In [13]:
df.info()
In [14]:
def show_duplicates(df):
    """Count unique classification_ids per user for one image_id.

    df needs to be an image_id dataframe (all rows for a single image_id).
    Returns a Series indexed by user_name, sorted descending, so users
    with duplicate classifications appear first.
    """
    # groupby().nunique() replaces the original lambda + apply; result is
    # identical (unique classification_id count per user_name).
    return (df.groupby('user_name')
              .classification_id
              .nunique()
              .sort_values(ascending=False))


def show_kyles_dupes(df):
    """Show top duplicate counts for the test image_id 'APF00004y6'.

    Works with all image_name data; filters down to the image_id first.
    """
    return show_duplicates(df[df.image_id == 'APF00004y6']).head()
In [15]:
show_kyles_dupes(data)
Out[15]:
In [16]:
df = data[data.image_id.isin(['APF00004y6', 'APF00003hp'])]
In [17]:
show_kyles_dupes(df)
Out[17]:
In [18]:
df2 = data[data.image_id=='APF00003hp']
In [19]:
show_duplicates(df2).head()
Out[19]:
In [20]:
user2 = df2[df2.user_name=='Kitharode']
First we find the earliest timestamp. In principle each classification_id has its own timestamp, as in Kitharode's data above. But this not-logged-in user has two classification_ids with the same timestamp:
In [21]:
user2 = df2[df2.user_name=='not-logged-in-073dee28bbc9c250d9dc02cb99f4ef93']
In [22]:
user2[user2.created_at==user2.created_at.min()].classification_id.unique()
Out[22]:
Note that the above is filtered for data at the minimum time!
Simply taking another minimum(), as I did in a previous version of this filtering, should therefore always work.
In [23]:
user2[user2.created_at==user2.created_at.min()].classification_id.min()
Out[23]:
For explanation: this is the earliest AND smallest classification_id for this image_id.
In [3]:
from ipyparallel import Client
c = Client()
lbview = c.load_balanced_view()
dview = c.direct_view()
In [5]:
def process_image_name(image_name):
    """De-duplicate classifications for one image_name (= obsid).

    For each (image_id, user_name) group, keep only the rows of the
    earliest-created classification, with ties broken by the smallest
    classification_id.

    NOTE(review): reads the global `dbname` -- when mapped onto
    ipyparallel engines, `dbname` must be pushed to the engines first.
    """
    from pandas import read_hdf
    # The HDF query parser resolves the bare RHS `image_name` from the
    # local scope, so only rows for this obsid are read from disk.
    data = read_hdf(dbname, 'df', where='image_name=image_name')
    c_ids = []

    def process_user_group(g):
        # earliest created_at; .min() on classification_id breaks ties
        c_ids.append(g.loc[g.created_at == g.created_at.min(),
                           'classification_id'].min())

    data.groupby(['image_id', 'user_name'], sort=False).apply(process_user_group)
    # list(...): .loc with a raw set is deprecated/removed in modern pandas
    keep = list(set(c_ids))
    tmp = data.set_index('classification_id').loc[keep].reset_index()
    return tmp
In [7]:
from planet4.reduction import get_image_names
image_names = get_image_names(uncleaned)
In [8]:
len(image_names)
Out[8]:
In [9]:
todo = image_names
In [10]:
from nbtools import display_multi_progress
In [11]:
results = lbview.map_async(process_image_name, todo)
In [12]:
display_multi_progress(results, todo)
In [13]:
len(results)
Out[13]:
In [22]:
len(results.result())
Out[22]:
In [14]:
all_df = pd.concat(results, ignore_index=True)
In [15]:
all_df.shape
Out[15]:
In [17]:
from pathlib import Path
In [19]:
p = Path(uncleaned)
In [23]:
newname = p.name[:-3]+ '_cleaned.h5'
In [24]:
data_columns = ['classification_id', 'image_id',
'image_name', 'user_name', 'marking',
'acquisition_date', 'local_mars_time']
In [25]:
all_df.to_hdf(p.parent / newname, 'df', format='table', data_columns=data_columns)
In [30]:
db.dbname
Out[30]:
In [ ]:
In [145]:
df = data[data.image_id=='APF00004y6']
In [146]:
df.image_name.unique()
Out[146]:
In [133]:
show_kyles_dupes(process_image_name(df))
Out[133]:
In [134]:
len(process_image_name(df))
Out[134]:
In [135]:
%timeit process_image_name(df)
In [ ]:
In [ ]:
In [ ]:
In [156]:
%timeit process_image_name('ESP_012053_0980')
In [157]:
len(image_names)
Out[157]:
In [151]:
image_names = data.image_name.unique()
In [ ]:
In [ ]:
In [ ]:
filtered_data = df.groupby(['user_name']).apply(process_user_group)
In [ ]:
from planet4 import helper_functions as hf
In [ ]:
def process_image_id(image_id):
    """Return (n_class_real, n_class) for one image_id.

    n_class: number of unique classification_ids (the expected count).
    n_class_real: de-duplicated count -- users who submitted several
    classifications for this image_id are counted once per user.
    Reads the notebook-global `data` frame; uses
    hf.classification_counts_per_user from planet4.helper_functions.
    """
    df = data[data.image_id == image_id]
    n_class = df.classification_id.unique().size
    # index = classifications-per-user, value = number of such users
    results = hf.classification_counts_per_user(df).value_counts()
    if not any(results.index > 1):
        # nobody classified more than once -> no duplicates to remove
        return n_class, n_class
    else:
        # users with exactly one classification; .get avoids the KeyError
        # the original raised when no user classified exactly once
        n_class_real = results.get(1, 0)
        for index in results.index[results.index > 1]:
            # each multi-classifying user contributes 1 (results[index]
            # is the number of users with `index` classifications)
            n_class_real += results[index]
        return (n_class_real, n_class)
In [ ]:
real_class_percents = []
for image_id in image_ids:
real_class_percents.append(process_image_id(image_id))
In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
In [ ]:
s = pd.DataFrame(real_class_percents, index=image_ids, columns=['real_n_class', 'expected_n_class'])
In [ ]:
s.head()
In [ ]:
s.plot()
In [ ]:
import seaborn as sns
sns.set_context('talk')
In [ ]:
s.head()
In [ ]:
s = s.assign(fraction=s.real_n_class/s.expected_n_class)
In [ ]:
s.fraction.plot(style='.',title='Inca City, season1, fraction of good classifications')
In [ ]:
s.describe()
In [ ]:
df = data[data.image_id=='APF0000zea']
n_class= df.classification_id.unique().size
n_class
results = hf.classification_counts_per_user(df).value_counts()
results
results.index>1
n_class_real = results[1]
for index in results.index[results.index>1]:
n_class_real += results[index]
n_class_real
In [31]:
subdata = data[data.image_id=='APF00003hp']
In [33]:
from planet4 import
In [36]:
subdata.info()
In [37]:
def pug1(g):
    """Keep only the rows carrying the group's first classification_id.

    "First" means first after sorting by classification_id ascending.
    """
    earliest = g.classification_id.sort_values().iloc[0]
    mask = g.classification_id == earliest
    return g[mask]
In [38]:
def pug1b(g):
    """Return the group's first classification_id (sorted ascending)."""
    ordered = g.sort_values(by='classification_id')
    first_id = ordered.classification_id.iloc[0]
    return first_id
In [39]:
def pug2(g):
    """Filter the group down to the rows of its minimal classification_id."""
    smallest = g.classification_id.min()
    return g[g.classification_id == smallest]
In [40]:
def pug2b(g):
    """Return the minimal classification_id of the group."""
    return g.classification_id.min()
In [ ]:
usergroup = data.groupby(['user_name'], sort=False)
In [ ]:
%timeit usergroup.apply(pug1).reset_index(drop=True)
In [ ]:
%timeit usergroup.apply(pug2).reset_index(drop=True)
In [ ]:
%timeit data[data.classification_id.isin(usergroup.classification_id.min())]
In [ ]:
v1 = usergroup.apply(pug1).reset_index(drop=True).sort_values(by=['classification_id'])
In [ ]:
v2 = usergroup.apply(pug2).reset_index(drop=True).sort_values(by=['classification_id'])
In [ ]:
v3 = data[data.classification_id.isin(usergroup.classification_id.min())]
In [ ]:
(v1.dropna() == v3.dropna()).all()
In [ ]:
v3[v3.classification_id=='50ef44b995e6e42d8c000001']
In [ ]:
(usergroup.apply(pug1).dropna() == usergroup.apply(pug2).dropna()).all()
In [ ]:
v1.info()
In [ ]:
(v1.dropna()==v2.dropna()).all()
In [ ]:
v2.head()
In [ ]:
data.groupby(['user_name','classification_id']).apply(lambda x: len(x.classification_id.unique())).sort_values(ascending=False).min()
In [ ]:
v3 = data[data.classification_id.isin(data.groupby('user_name').classification_id.max())].sort_values(by='classification_id')
In [ ]:
(v1.classification_id.sort_values() == v2.classification_id.sort_values()).all()
In [ ]:
(g.apply(pug1).reset_index(drop=True) == g.apply(pug2).reset_index(drop=True)).all()
In [ ]:
%timeit g.apply(pug2b)
In [ ]:
# Sanity check for the de-duplication: for a known affected
# (image_id, user) pair, print created_at times and classification_ids
# before and after the duplicate filtering.
# NOTE(review): this cell needs `db` (an io.DBManager, created in a later
# cell in this export) and a module-level `process_user_group` -- the only
# process_user_group visible in this notebook is nested inside
# process_image_name. Confirm both exist in the session before running.
img_ids = ['APF00003hp']
users = ['not-logged-in-073dee28bbc9c250d9dc02cb99f4ef93']
for img_id, user in zip(img_ids, users):
    print("image_id: ", img_id)
    print("User: ", user)
    data = db.get_image_id_markings(img_id)
    print("Before filtering classification_id created_at times:")
    print(data[data.user_name==user].created_at.unique())
    print("Classification_ids:")
    print(data[data.user_name==user].classification_id.unique())
    g = data.groupby(['user_name'])
    # keep only each user's earliest/smallest classification per image_id
    res = g.apply(process_user_group).reset_index(drop=True)
    print("After filtering:")
    print(res[res.user_name==user].created_at.unique())
    print(res[res.user_name==user].classification_id.unique())
    print()
In [ ]:
fname = '/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable.h5'
db = io.DBManager()
In [ ]:
db.dbname
In [ ]:
data = pd.read_hdf(db.dbname, 'df',
where="classification_id=='50ef44b795e6e42cd2000001'")
data
In [ ]:
df = pd.read_hdf(db.dbname, 'df')
In [ ]:
df[df.classification_id=='50ef44b795e6e42cd2000001']
In [ ]:
df[df.classification_id=='50ef44b995e6e42d8c000001']
In [ ]:
df[df.classification_id=='50ee0e5694b9d564a90000b5']
In [ ]:
db.dbname
In [ ]:
df.classification_id = df.classification_id.astype('str')
In [ ]:
df.to_hdf('/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable_cleaned.h5',
'df', format='t', data_columns=reduction.data_columns)
In [ ]:
pd.read_hdf('/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable_cleaned.h5',
'df', where="classification_id=='50ef44b795e6e42cd2000001'")
In [ ]:
fname = '/Users/klay6683/data/planet4/2015-10-11_planet_four_classifications_queryable.h5'
In [ ]:
reduction.remove_duplicates_from_file(fname)
In [ ]:
data = pd.read_hdf('testing.h5', 'df',
where="classification_id=='50ef44b995e6e42d8c000001'")
data
In [ ]:
data2 = db.get_class_id_data('50ef44b995e6e42d8c000001')
In [ ]:
s = pd.Series(list('abc'))
In [ ]:
pd.DataFrame(s)
In [ ]:
from planet4 import io
In [ ]:
db=io.DBManager()
In [ ]:
import time
In [ ]:
imgnames = db.season2and3_image_names
In [ ]:
where = "image_name in {}".format(imgnames.values.tolist())
In [ ]:
where
In [ ]:
import time
t0 = time.time()
season23 = pd.read_hdf(db.dbname, 'df', where=where)
t1 = time.time()
print("time: ", t1 - t0)
In [30]:
cats = [ "s%07d" % i for i in range(4000000) ]
df = pd.DataFrame({'A' : cats})
In [31]:
df.info()
In [32]:
In [3]: df['B'] = df['A'].astype('category')
In [4]: df.B.cat.codes.dtype
Out[32]:
In [33]:
for i in range(3):
df.to_hdf('test_{}.h5'.format(i),'df',mode='w',data_columns=True,format='table')
In [36]:
df = []
for i in range(3):
df.append(pd.read_hdf('test_{}.h5'.format(i), 'df'))
In [37]:
df = pd.concat(df, ignore_index=True)
In [38]:
df[df.B=='s0000005']
Out[38]:
In [39]:
df[df.B=='s3999999']
Out[39]:
In [40]:
df.to_hdf('test.h5', 'df', format='table', data_columns=True)
In [41]:
pd.read_hdf('test.h5','df',where='A="s3999999"')
Out[41]:
In [43]:
pd.read_hdf('test.h5','df',where='B="s3999999"')
Out[43]: