In [ ]:
%matplotlib ipympl
In [ ]:
# Third-party imports. pandas (pd), numpy (np) and matplotlib.pyplot (plt)
# are used throughout this notebook but were never imported — added here.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Project-local imports.
from planet4 import io, stats, markings, plotting, region_data
from planet4.catalog_production import ReleaseManager

# Seaborn styling applied to every figure in this notebook.
sns.set()
sns.set_context('paper')
sns.set_palette('colorblind')
In [ ]:
# Load the v1.0 catalog (L1C cut at 0.5) fan and blotch tables, already
# merged with HiRISE metadata.
# NOTE(review): absolute user-specific paths — these only resolve on the
# original author's machine; consider deriving from the ReleaseManager paths.
fans = pd.read_csv("/Users/klay6683/Dropbox/data/planet4/p4_analysis/P4_catalog_v1.0/P4_catalog_v1.0_L1C_cut_0.5_fan_meta_merged.csv")
blotch = pd.read_csv("/Users/klay6683/Dropbox/data/planet4/p4_analysis/P4_catalog_v1.0/P4_catalog_v1.0_L1C_cut_0.5_blotch_meta_merged.csv")
In [ ]:
pd.set_option("display.max_columns", 150)
In [ ]:
fans.head()
In [ ]:
fans.l_s.head().values[0]
In [ ]:
group_blotch = blotch.groupby("obsid")
In [ ]:
type(group_blotch)
In [ ]:
counts = group_blotch.marking_id.count()
In [ ]:
counts.head()
In [ ]:
counts.plot(c='r')
In [ ]:
plt.figure()
counts.hist()
In [ ]:
counts.max()
In [ ]:
counts.min()
In [ ]:
fans.head()
In [ ]:
plt.figure(constrained_layout=True)
counts[:20].plot.bar()
In [ ]:
plt.figure()
counts[:10].plot(use_index=True)
In [ ]:
plt.figure()
counts[:10]
In [ ]:
In [ ]:
In [ ]:
grouped = fans.groupby("obsid")
In [ ]:
grouped.tile_id.nunique().sort_values(ascending=False).head()
In [ ]:
In [ ]:
%matplotlib inline
In [ ]:
from planet4.markings import ImageID
In [ ]:
p4id = ImageID('7t9')
In [ ]:
p4id.image_name
In [ ]:
p4id.plot_fans()
In [ ]:
filtered = fans[fans.tile_id=='APF0000cia']
In [ ]:
filtered.shape
In [ ]:
p4id.plot_fans(data=filtered)
In [ ]:
In [ ]:
import dask.dataframe as dd
In [ ]:
db = io.DBManager()
db.dbname
In [ ]:
df = dd.read_hdf(db.dbname, 'df')
In [ ]:
df.columns
In [ ]:
grp = df.groupby(['user_name'])
In [ ]:
s = grp.classification_id.nunique().compute().sort_values(ascending=False).head(5)
In [ ]:
s
In [ ]:
rm = ReleaseManager('v1.0')
In [ ]:
db = io.DBManager()
In [ ]:
data = db.get_all()
In [ ]:
In [ ]:
fans = pd.read_csv(rm.fan_merged)
In [ ]:
fans.shape
In [ ]:
fans.columns
In [ ]:
from planet4.stats import define_season_column
In [ ]:
define_season_column(fans)
In [ ]:
fans.columns
In [ ]:
In [ ]:
season2 = fans[fans.season==2]
In [ ]:
season2.shape
In [ ]:
img223 = fans.query("image_name=='ESP_012265_0950'")
In [ ]:
img223.shape
In [ ]:
plt.figure()
img223.angle.hist()
In [ ]:
fans.season.dtype
In [ ]:
meta = pd.read_csv(rm.metadata_path, dtype='str')
In [ ]:
cols_to_merge = ['OBSERVATION_ID',
'SOLAR_LONGITUDE', 'north_azimuth', 'map_scale']
In [ ]:
fans = fans.merge(meta[cols_to_merge], left_on='obsid', right_on='OBSERVATION_ID')
In [ ]:
fans.drop(rm.DROP_FOR_FANS, axis=1, inplace=True)
In [ ]:
fans.image_x.head()
In [ ]:
# NOTE(review): `ground` is not defined anywhere in this notebook — it is
# presumably a ground-projection DataFrame loaded in a cell that is missing
# from this export; confirm before running. Coordinates arrive as strings
# and must be numeric for the exact-match merge below.
ground['image_x'] = pd.to_numeric(ground.image_x)
In [ ]:
ground['image_y'] = pd.to_numeric(ground.image_y)
In [ ]:
# Merge ground-projection columns onto fans by tile pixel coordinates.
fans_new = fans.merge(ground[rm.COLS_TO_MERGE], on=['obsid', 'image_x', 'image_y'])
In [ ]:
fans_new.shape
In [ ]:
fans.shape
In [ ]:
s = pd.to_numeric(ground.BodyFixedCoordinateX)
In [ ]:
s.head()
In [ ]:
s.round(decimals=4)
In [ ]:
In [ ]:
In [ ]:
blotches = rm.read_blotch_file().assign(marking='blotch')
fans = rm.read_fan_file().assign(marking='fan')
combined = pd.concat([blotches, fans], ignore_index=True)
blotches.head()
In [ ]:
fans.columns
In [ ]:
cols1 = fans.columns[:13]
print(cols1)
cols2 = fans.columns[13:-4]
print(cols2)
cols3 = fans.columns[-4:-1]
cols3
In [ ]:
fanshead1 = fans[cols1].head(10)
fanshead2 = fans[cols2].head(10)
fanshead3 = fans[cols3].head(10)
In [ ]:
with open("fan_table1.tex", 'w') as f:
f.write(fanshead1.to_latex())
In [ ]:
with open("fan_table2.tex", 'w') as f:
f.write(fanshead2.to_latex())
In [ ]:
with open("fan_table3.tex", 'w') as f:
f.write(fanshead3.to_latex())
In [ ]:
# Tag every marking row with its region-of-interest name: rows whose obsid
# appears in a region's observation list get that region's name written into
# a new 'region' column on combined, fans and blotches.
# (Loop-body indentation was lost in this notebook export.)
for Reg in region_data.regions:
reg = Reg()
print(reg.name)
combined.loc[combined.obsid.isin(reg.all_obsids), 'region'] = reg.name
fans.loc[fans.obsid.isin(reg.all_obsids), 'region']= reg.name
blotches.loc[blotches.obsid.isin(reg.all_obsids), 'region'] = reg.name
In [ ]:
tiles_marked = combined.tile_id.unique()
In [ ]:
db = io.DBManager()
In [ ]:
input_tiles = db.image_ids
input_tiles.shape[0]
In [ ]:
n_empty = input_tiles.shape[0] - tiles_marked.shape[0]
n_empty
In [ ]:
n_empty / input_tiles.shape[0]
In [ ]:
empty_tiles = list(set(input_tiles) - set(tiles_marked))
In [ ]:
all_data = db.get_all()
In [ ]:
all_data.set_index('image_id', inplace=True)
In [ ]:
empty_data = all_data.loc[empty_tiles]
In [ ]:
meta = pd.read_csv(rm.metadata_path)
meta.head()
In [ ]:
empty_tile_numbers = empty_data.reset_index().groupby('image_name')[['x_tile', 'y_tile']].max()
In [ ]:
empty_tile_numbers['total'] = empty_tile_numbers.x_tile*empty_tile_numbers.y_tile
In [ ]:
empty_tile_numbers.head()
In [ ]:
n_empty_per_obsid = empty_data.reset_index().groupby('image_name').image_id.nunique()
In [ ]:
n_empty_per_obsid = n_empty_per_obsid.to_frame()
In [ ]:
n_empty_per_obsid.columns = ['n']
In [ ]:
df = n_empty_per_obsid
In [ ]:
df = df.join(empty_tile_numbers.total)
In [ ]:
df = df.assign(ratio=df.n/df.total)
In [ ]:
df = df.join(meta.set_index('OBSERVATION_ID'))
In [ ]:
df['scaled_n'] = df.n / df.map_scale / df.map_scale
In [ ]:
import seaborn as sns
sns.set_context('notebook')
In [ ]:
# Scatter plot: fraction of empty tiles per HiRISE image vs solar longitude.
df.plot(kind='scatter', y='ratio', x='SOLAR_LONGITUDE')
ax = plt.gca()
ax.set_ylabel('Fraction of empty tiles per HiRISE image')
# Raw string: '\c' is an invalid escape sequence in a normal string literal
# (SyntaxWarning on modern Python); the rendered LaTeX label is unchanged.
ax.set_xlabel(r'Solar Longitude [$^\circ$]')
ax.set_title("Distribution of empty tiles vs time")
plt.savefig("/Users/klay6683/Dropbox/src/p4_paper1/figures/empty_data_vs_ls.pdf")
In [ ]:
df[df.ratio > 0.8]
In [ ]:
sample = np.random.choice(empty_tiles, 200)
In [ ]:
cd plots
In [ ]:
from tqdm import tqdm
In [ ]:
# For a random sample of "empty" tiles, save side-by-side figures of the raw
# (input) fan and blotch markings, to visually verify they are really empty.
# (Loop-body indentation was lost in this notebook export.)
for image_id in tqdm(sample):
fig, ax = plt.subplots(ncols=2)
plotting.plot_raw_fans(image_id, ax=ax[0])
plotting.plot_raw_blotches(image_id, ax=ax[1])
fig.savefig(f"empty_tiles/{image_id}_input_markings.png", dpi=150)
plt.close('all')
In [ ]:
fans_per_tile = fans.groupby('tile_id').size().sort_values(ascending=False)
fans_per_tile.head()
In [ ]:
blotches_per_tile = blotches.groupby('tile_id').size().sort_values(ascending=False)
blotches_per_tile.head()
In [ ]:
print(fans_per_tile.median())
blotches_per_tile.median()
In [ ]:
plt.close('all')
In [ ]:
by_image_id = combined.groupby(['marking', 'tile_id']).size()
In [ ]:
by_image_id.name = 'Markings per tile'
In [ ]:
by_image_id = by_image_id.reset_index()
In [ ]:
by_image_id.columns
In [ ]:
g = sns.FacetGrid(by_image_id, col="marking", aspect=1.2)
bins = np.arange(0, 280, 5)
g.map(sns.distplot, 'Markings per tile', kde=False, bins=bins, hist_kws={'log':True})
plt.savefig('/Users/klay6683/Dropbox/src/p4_paper1/figures/number_distributions.pdf', dpi=150)
In [ ]:
blotches_per_tile.median()
In [ ]:
from planet4 import plotting
In [ ]:
# %load -n plotting.plot_finals_with_input
def plot_finals_with_input(id_, datapath=None, horizontal=True, scope='planet4'):
    """Plot the raw tile next to its final fan/blotch catalog markings.

    Parameters
    ----------
    id_ : str
        Planet Four tile id (e.g. '7t9').
    datapath : str or pathlib.Path, optional
        Folder with the final catalog files, handed to io.PathManager.
    horizontal : bool, optional
        If True, panels are side by side; otherwise stacked vertically.
    scope : str, optional
        Scope argument forwarded to markings.ImageID.

    Side effects
    ------------
    Saves the figure as <imgid>_final.png into the (hard-coded) paper
    figures folder.
    """
    imgid = markings.ImageID(id_, scope=scope)
    pm = io.PathManager(id_=id_, datapath=datapath)
    layout = {'ncols': 2} if horizontal else {'nrows': 2}
    fig, ax = plt.subplots(figsize=(4, 5), **layout)
    ax[0].set_title(imgid.imgid, fontsize=8)
    imgid.show_subframe(ax=ax[0])
    for marking in ['fan', 'blotch']:
        try:
            df = getattr(pm, f"final_{marking}df")
        except Exception:
            # Narrowed from a bare `except:` (which also caught
            # KeyboardInterrupt/SystemExit); a missing final file for this
            # marking type is expected and simply skipped.
            continue
        else:
            data = df[df.image_id == imgid.imgid]
            imgid.plot_markings(marking, data, ax=ax[1])
    fig.subplots_adjust(top=0.95, bottom=0, left=0, right=1, hspace=0.01, wspace=0.01)
    fig.savefig(f"/Users/klay6683/Dropbox/src/p4_paper1/figures/{imgid.imgid}_final.png",
                dpi=150)
In [ ]:
plot_finals_with_input('7t9', rm.savefolder, horizontal=False)
In [ ]:
markings.ImageID('7t9').image_name
In [ ]:
fans['distance_m'] = fans.distance*fans.map_scale
blotches['radius_1_m'] = blotches.radius_1*blotches.map_scale
blotches['radius_2_m'] = blotches.radius_2*blotches.map_scale
In [ ]:
n_fan_in = 2792963
In [ ]:
fans.shape[0]
In [ ]:
fans.shape[0] / n_fan_in
In [ ]:
import scipy
scipy.stats.percentileofscore(fans.distance_m, 100)
Cumulative histogram of fan lengths
In [ ]:
def add_percentage_line(ax, meters, column):
    """Annotate `ax` with the cumulative fraction of `column` below `meters`.

    Draws a dashed horizontal line at that fraction, a dashed vertical line
    at `meters`, and labels the crossing point with the fraction (2 decimals).
    """
    fraction = scipy.stats.percentileofscore(column, meters) / 100
    dashed = dict(linestyle='dashed', color='black', lw=1)
    ax.axhline(fraction, **dashed)
    ax.axvline(meters, **dashed)
    ax.text(meters, fraction, f"{fraction:0.2f}")
In [ ]:
plt.close('all')
In [ ]:
fans.distance_m.max()
In [ ]:
bins = np.arange(0,380, 5)
In [ ]:
fig, ax = plt.subplots(figsize=(8,3), ncols=2, sharey=False)
sns.distplot(fans.distance_m, bins=bins, kde=False,
hist_kws={'cumulative':False,'normed':True, 'log':True},
axlabel='Fan length [m]', ax=ax[0])
sns.distplot(fans.distance_m, bins=bins, kde=False, hist_kws={'cumulative':True,'normed':True},
axlabel='Fan length [m]', ax=ax[1])
ax[0].set_title("Normalized Log-Histogram of fan lengths ")
ax[1].set_title("Cumulative normalized histogram of fan lengths")
ax[1].set_ylabel("Fraction of fans with given length")
add_percentage_line(ax[1], 100, fans.distance_m)
add_percentage_line(ax[1], 50, fans.distance_m)
fig.tight_layout()
fig.savefig("/Users/klay6683/Dropbox/src/p4_paper1/figures/fan_lengths_histos.pdf",
dpi=150, bbox_inches='tight')
In [ ]:
fans.query('distance_m>350')[['distance_m', 'obsid', 'l_s']]
In [ ]:
fans.distance_m.describe()
In words, the mean length of fans is {{f"{fans.distance_m.describe()['mean']:.1f}"}} m, while the median is {{f"{fans.distance_m.describe()['50%']:.1f}"}} m.
In [ ]:
fans.replace("Manhattan_Frontinella", "Manhattan_\nFrontinella", inplace=True)
In [ ]:
fig, ax = plt.subplots()
sns.boxplot(y="region", x="distance_m", data=fans, ax=ax,
fliersize=3)
ax.set_title("Fan lengths in different ROIs")
fig.tight_layout()
fig.savefig("/Users/klay6683/Dropbox/src/p4_paper1/figures/fan_lengths_vs_regions.pdf",
dpi=150, bbox_inches='tight')
In [ ]:
plt.figure()
cols = ['radius_1','radius_2']
sns.distplot(blotches[cols], kde=False, bins=np.arange(2.0,50.),
color=['r','g'], label=cols)
plt.legend()
In [ ]:
plt.figure()
cols = ['radius_1_m','radius_2_m']
sns.distplot(blotches[cols], kde=False, bins=np.arange(2.0,50.),
color=['r','g'], label=cols)
plt.legend()
In [ ]:
# Cumulative normalized histogram of blotch radius_2 (meters).
fig, ax = plt.subplots(figsize=(8, 4))
# Fixed labels: they said "radius_1" although the plotted column is radius_2_m.
sns.distplot(blotches.radius_2_m, bins=500, kde=False,
             hist_kws={'cumulative': True, 'normed': True},
             axlabel='Blotch radius_2 [m]', ax=ax)
ax.set_title("Cumulative normalized histogram for blotch lengths")
ax.set_ylabel("Fraction of blotches with given radius_2")
add_percentage_line(ax, 30, blotches.radius_2_m)
add_percentage_line(ax, 10, blotches.radius_2_m)
In [ ]:
import scipy
scipy.stats.percentileofscore(blotches.radius_2_m, 30)
In [ ]:
plt.close('all')
In [ ]:
fans.query('distance_m > 350')[
'distance_m distance obsid image_x image_y tile_id'.split()].sort_values(
by='distance_m')
In [ ]:
from planet4 import plotting
In [ ]:
plotting.plot_finals('de3', datapath=rm.catalog)
plt.gca().set_title('APF0000de3')
In [ ]:
plotting.plot_image_id_pipeline('de3', datapath=rm.catalog, via_obsid=False, figsize=(12,8))
In [ ]:
from planet4 import region_data
In [ ]:
from planet4 import stats
In [ ]:
stats.define_season_column(fans)
stats.define_season_column(blotches)
In [ ]:
fans.season.value_counts()
In [ ]:
fans.query('season==2').distance_m.median()
In [ ]:
fans.query('season==3').distance_m.median()
In [ ]:
from planet4 import region_data
In [ ]:
# Median fan length per region and per season: for each named region class
# in region_data, look up its season2/season3 obsid lists and print the
# median distance_m of the fans in those observations.
# (Loop-body indentation was lost in this notebook export.)
for region in ['Manhattan2', 'Giza','Ithaca']:
print(region)
obj = getattr(region_data, region)
for s in ['season2','season3']:
print(s)
obsids = getattr(obj, s)
print(fans[fans.obsid.isin(obsids)].distance_m.median())
In [ ]:
db = io.DBManager()
In [ ]:
all_data = db.get_all()
In [ ]:
image_names = db.image_names
In [ ]:
g_all = all_data.groupby('image_id')
In [ ]:
g_all.size().sort_values().head()
In [ ]:
fans.columns
In [ ]:
cols_to_drop = ['path', 'image_name', 'binning', 'LineResolution', 'SampleResolution', 'Line', 'Sample']
In [ ]:
fans.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
In [ ]:
fans.columns
In [ ]:
fans.iloc[1]
In [ ]:
# A pasted LaTeX table body (one HiRISE observation per row); the obsids in
# its first column are extracted below to query the metadata table.
s = """ESP\_011296\_0975 & -82.197 & 225.253 & 178.8 & 2008-12-23 & 17:08 & 91 \\
ESP\_011341\_0980 & -81.797 & 76.13 & 180.8 & 2008-12-27 & 17:06 & 126 \\
ESP\_011348\_0950 & -85.043 & 259.094 & 181.1 & 2008-12-27 & 18:01 & 91 \\
ESP\_011350\_0945 & -85.216 & 181.415 & 181.2 & 2008-12-27 & 16:29 & 126 \\
ESP\_011351\_0945 & -85.216 & 181.548 & 181.2 & 2008-12-27 & 18:18 & 91 \\
ESP\_011370\_0980 & -81.925 & 4.813 & 182.1 & 2008-12-29 & 17:08 & 126 \\
ESP\_011394\_0935 & -86.392 & 99.068 & 183.1 & 2008-12-31 & 19:04 & 72 \\
ESP\_011403\_0945 & -85.239 & 181.038 & 183.5 & 2009-01-01 & 16:56 & 164 \\
ESP\_011404\_0945 & -85.236 & 181.105 & 183.6 & 2009-01-01 & 18:45 & 91 \\
ESP\_011406\_0945 & -85.409 & 103.924 & 183.7 & 2009-01-01 & 17:15 & 126 \\
ESP\_011407\_0945 & -85.407 & 103.983 & 183.7 & 2009-01-01 & 19:04 & 91 \\
ESP\_011408\_0930 & -87.019 & 86.559 & 183.8 & 2009-01-01 & 19:43 & 59 \\
ESP\_011413\_0970 & -82.699 & 273.129 & 184.0 & 2009-01-01 & 17:17 & 108 \\
ESP\_011420\_0930 & -87.009 & 127.317 & 184.3 & 2009-01-02 & 20:16 & 54 \\
ESP\_011422\_0930 & -87.041 & 72.356 & 184.4 & 2009-01-02 & 20:15 & 54 \\
ESP\_011431\_0930 & -86.842 & 178.244 & 184.8 & 2009-01-03 & 19:41 & 54 \\
ESP\_011447\_0950 & -84.805 & 65.713 & 185.5 & 2009-01-04 & 17:19 & 218 \\
ESP\_011448\_0950 & -84.806 & 65.772 & 185.6 & 2009-01-04 & 19:09 & 59 \\"""
In [ ]:
# Split on the LaTeX row terminator to get one string per table row.
lines = s.split(' \\')
In [ ]:
# NOTE(review): this replace() result is not assigned — the statement has no
# effect (strings are immutable); only its cell display output was used.
s.replace('\\', '')
In [ ]:
# First '&'-separated field of each row, stripped of the LaTeX escape
# backslashes, gives the obsid; the trailing empty split fragment is dropped.
obsids = [line.split('&')[0].strip().replace('\\','') for line in lines][:-1]
In [ ]:
meta = pd.read_csv(rm.metadata_path)
In [ ]:
meta.query('obsid in @obsids').sort_values(by='obsid').
In [ ]:
blotches.groupby('obsid').north_azimuth.nunique()
In [ ]:
db = io.DBManager()
In [ ]:
db.dbname = '/Users/klay6683/local_data/planet4/2018-02-11_planet_four_classifications_queryable_cleaned_seasons2and3.h5'
In [ ]:
# Read only the user_name column from the HDF5 store (select_column avoids
# loading the full table) and keep the unique names.
# (Block indentation was lost in this notebook export.)
with pd.HDFStore(str(db.dbname)) as store:
user_names = store.select_column('df', 'user_name').unique()
In [ ]:
user_names.shape
In [ ]:
user_names[:10]
In [ ]:
not_logged = [i for i in user_names if i.startswith('not-logged-in')]
In [ ]:
logged = list(set(user_names) - set(not_logged))
In [ ]:
len(logged)
In [ ]:
len(not_logged)
In [ ]:
not_logged[:20]
In [ ]:
df = db.get_all()
In [ ]:
df[df.marking=='fan'].shape
In [ ]:
df[df.marking=='blotch'].shape
In [ ]:
df[df.marking=='interesting'].shape
In [ ]:
n_class_by_user = df.groupby('user_name').classification_id.nunique()
In [ ]:
n_class_by_user.describe()
In [ ]:
logged_users = df.user_name[~df.user_name.str.startswith("not-logged-in")].unique()
In [ ]:
logged_users.shape
In [ ]:
not_logged = list(set(df.user_name.unique()) - set(logged_users))
In [ ]:
len(not_logged)
In [ ]:
n_class_by_user[not_logged].describe()
In [ ]:
n_class_by_user[logged_users].describe()
In [ ]:
n_class_by_user[n_class_by_user>50].shape[0]/n_class_by_user.shape[0]
In [ ]:
n_class_by_user.shape
In [ ]:
pm = io.PathManager('any', datapath=rm.savefolder)
In [ ]:
cols1 = pm.fandf.columns[:8]
cols2 = pm.fandf.columns[8:-2]
cols3 = pm.fandf.columns[-2:]
In [ ]:
print(pm.fandf[cols1].to_latex())
In [ ]:
print(pm.fandf[cols2].to_latex())
In [ ]:
print(pm.fandf[cols3].to_latex())
In [ ]:
df = pm.fnotchdf.head(4)
In [ ]:
cols1 = df.columns[:6]
cols2 = df.columns[6:14]
cols3 = df.columns[14:]
In [ ]:
# Print each column group as a LaTeX table; iterate over the variables
# directly instead of reconstructing their names with eval().
for cols in (cols1, cols2, cols3):
    print(df[cols].to_latex())
In [ ]: