In [ ]:
%matplotlib ipympl
In [ ]:
# Third-party imports. pandas (pd), numpy (np) and matplotlib.pyplot (plt)
# are used throughout this notebook but were never imported — added here.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Project-local imports.
from planet4 import io, stats, markings, plotting, region_data
from planet4.catalog_production import ReleaseManager

# Seaborn styling applied to every figure in this notebook.
sns.set()
sns.set_context('paper')
sns.set_palette('colorblind')
In [ ]:
# Load the v1.0 catalog (L1C cut at 0.5) fan and blotch tables, already
# merged with HiRISE metadata.
# NOTE(review): absolute user-specific paths — these only resolve on the
# original author's machine; consider deriving from the ReleaseManager paths.
fans = pd.read_csv("/Users/klay6683/Dropbox/data/planet4/p4_analysis/P4_catalog_v1.0/P4_catalog_v1.0_L1C_cut_0.5_fan_meta_merged.csv")
blotch = pd.read_csv("/Users/klay6683/Dropbox/data/planet4/p4_analysis/P4_catalog_v1.0/P4_catalog_v1.0_L1C_cut_0.5_blotch_meta_merged.csv")
In [ ]:
pd.set_option("display.max_columns", 150)
In [ ]:
fans.head()
In [ ]:
fans.l_s.head().values[0]
In [ ]:
group_blotch = blotch.groupby("obsid")
In [ ]:
type(group_blotch)
In [ ]:
counts = group_blotch.marking_id.count()
In [ ]:
counts.head()
In [ ]:
counts.plot(c='r')
In [ ]:
plt.figure()
counts.hist()
In [ ]:
counts.max()
In [ ]:
counts.min()
In [ ]:
fans.head()
In [ ]:
plt.figure(constrained_layout=True)
counts[:20].plot.bar()
In [ ]:
plt.figure()
counts[:10].plot(use_index=True)
In [ ]:
plt.figure()
counts[:10]
In [ ]:
In [ ]:
In [ ]:
grouped = fans.groupby("obsid")
In [ ]:
grouped.tile_id.nunique().sort_values(ascending=False).head()
In [ ]:
In [ ]:
%matplotlib inline
In [ ]:
from planet4.markings import ImageID
In [ ]:
p4id = ImageID('7t9')
In [ ]:
p4id.image_name
In [ ]:
p4id.plot_fans()
In [ ]:
filtered = fans[fans.tile_id=='APF0000cia']
In [ ]:
filtered.shape
In [ ]:
p4id.plot_fans(data=filtered)
In [ ]:
In [ ]:
import dask.dataframe as dd
In [ ]:
db = io.DBManager()
db.dbname
In [ ]:
df = dd.read_hdf(db.dbname, 'df')
In [ ]:
df.columns
In [ ]:
grp = df.groupby(['user_name'])
In [ ]:
s = grp.classification_id.nunique().compute().sort_values(ascending=False).head(5)
In [ ]:
s
In [ ]:
rm = ReleaseManager('v1.0')
In [ ]:
db = io.DBManager()
In [ ]:
data = db.get_all()
In [ ]:
In [ ]:
fans = pd.read_csv(rm.fan_merged)
In [ ]:
fans.shape
In [ ]:
fans.columns
In [ ]:
from planet4.stats import define_season_column
In [ ]:
define_season_column(fans)
In [ ]:
fans.columns
In [ ]:
In [ ]:
season2 = fans[fans.season==2]
In [ ]:
season2.shape
In [ ]:
img223 = fans.query("image_name=='ESP_012265_0950'")
In [ ]:
img223.shape
In [ ]:
plt.figure()
img223.angle.hist()
In [ ]:
fans.season.dtype
In [ ]:
meta = pd.read_csv(rm.metadata_path, dtype='str')
In [ ]:
cols_to_merge = ['OBSERVATION_ID',
'SOLAR_LONGITUDE', 'north_azimuth', 'map_scale']
In [ ]:
fans = fans.merge(meta[cols_to_merge], left_on='obsid', right_on='OBSERVATION_ID')
In [ ]:
fans.drop(rm.DROP_FOR_FANS, axis=1, inplace=True)
In [ ]:
fans.image_x.head()
In [ ]:
# NOTE(review): `ground` is not defined anywhere in this notebook — it is
# presumably a ground-projection DataFrame loaded in a cell that is missing
# from this export; confirm before running. Coordinates arrive as strings
# and must be numeric for the exact-match merge below.
ground['image_x'] = pd.to_numeric(ground.image_x)
In [ ]:
ground['image_y'] = pd.to_numeric(ground.image_y)
In [ ]:
# Merge ground-projection columns onto fans by tile pixel coordinates.
fans_new = fans.merge(ground[rm.COLS_TO_MERGE], on=['obsid', 'image_x', 'image_y'])
In [ ]:
fans_new.shape
In [ ]:
fans.shape
In [ ]:
s = pd.to_numeric(ground.BodyFixedCoordinateX)
In [ ]:
s.head()
In [ ]:
s.round(decimals=4)
In [ ]:
In [ ]:
In [ ]:
blotches = rm.read_blotch_file().assign(marking='blotch')
fans = rm.read_fan_file().assign(marking='fan')
combined = pd.concat([blotches, fans], ignore_index=True)
blotches.head()
In [ ]:
fans.columns
In [ ]:
cols1 = fans.columns[:13]
print(cols1)
cols2 = fans.columns[13:-4]
print(cols2)
cols3 = fans.columns[-4:-1]
cols3
In [ ]:
fanshead1 = fans[cols1].head(10)
fanshead2 = fans[cols2].head(10)
fanshead3 = fans[cols3].head(10)
In [ ]:
with open("fan_table1.tex", 'w') as f:
f.write(fanshead1.to_latex())
In [ ]:
with open("fan_table2.tex", 'w') as f:
f.write(fanshead2.to_latex())
In [ ]:
with open("fan_table3.tex", 'w') as f:
f.write(fanshead3.to_latex())
In [ ]:
# Tag every marking row with its region-of-interest name: rows whose obsid
# appears in a region's observation list get that region's name written into
# a new 'region' column on combined, fans and blotches.
# (Loop-body indentation was lost in this notebook export.)
for Reg in region_data.regions:
reg = Reg()
print(reg.name)
combined.loc[combined.obsid.isin(reg.all_obsids), 'region'] = reg.name
fans.loc[fans.obsid.isin(reg.all_obsids), 'region']= reg.name
blotches.loc[blotches.obsid.isin(reg.all_obsids), 'region'] = reg.name
In [ ]:
tiles_marked = combined.tile_id.unique()
In [ ]:
db = io.DBManager()
In [ ]:
input_tiles = db.image_ids
input_tiles.shape[0]
In [ ]:
n_empty = input_tiles.shape[0] - tiles_marked.shape[0]
n_empty
In [ ]:
n_empty / input_tiles.shape[0]
In [ ]:
empty_tiles = list(set(input_tiles) - set(tiles_marked))
In [ ]:
all_data = db.get_all()
In [ ]:
all_data.set_index('image_id', inplace=True)
In [ ]:
empty_data = all_data.loc[empty_tiles]
In [ ]:
meta = pd.read_csv(rm.metadata_path)
meta.head()
In [ ]:
empty_tile_numbers = empty_data.reset_index().groupby('image_name')[['x_tile', 'y_tile']].max()
In [ ]:
empty_tile_numbers['total'] = empty_tile_numbers.x_tile*empty_tile_numbers.y_tile
In [ ]:
empty_tile_numbers.head()
In [ ]:
n_empty_per_obsid = empty_data.reset_index().groupby('image_name').image_id.nunique()
In [ ]:
n_empty_per_obsid = n_empty_per_obsid.to_frame()
In [ ]:
n_empty_per_obsid.columns = ['n']
In [ ]:
df = n_empty_per_obsid
In [ ]:
df = df.join(empty_tile_numbers.total)
In [ ]:
df = df.assign(ratio=df.n/df.total)
In [ ]:
df = df.join(meta.set_index('OBSERVATION_ID'))
In [ ]:
df['scaled_n'] = df.n / df.map_scale / df.map_scale
In [ ]:
import seaborn as sns
sns.set_context('notebook')
In [ ]:
# Scatter plot: fraction of empty tiles per HiRISE image vs solar longitude.
df.plot(kind='scatter', y='ratio', x='SOLAR_LONGITUDE')
ax = plt.gca()
ax.set_ylabel('Fraction of empty tiles per HiRISE image')
# Raw string: '\c' is an invalid escape sequence in a normal string literal
# (SyntaxWarning on modern Python); the rendered LaTeX label is unchanged.
ax.set_xlabel(r'Solar Longitude [$^\circ$]')
ax.set_title("Distribution of empty tiles vs time")
plt.savefig("/Users/klay6683/Dropbox/src/p4_paper1/figures/empty_data_vs_ls.pdf")
In [ ]:
df[df.ratio > 0.8]
In [ ]:
sample = np.random.choice(empty_tiles, 200)
In [ ]:
cd plots
In [ ]:
from tqdm import tqdm
In [ ]:
# For a random sample of "empty" tiles, save side-by-side figures of the raw
# (input) fan and blotch markings, to visually verify they are really empty.
# (Loop-body indentation was lost in this notebook export.)
for image_id in tqdm(sample):
fig, ax = plt.subplots(ncols=2)
plotting.plot_raw_fans(image_id, ax=ax[0])
plotting.plot_raw_blotches(image_id, ax=ax[1])
fig.savefig(f"empty_tiles/{image_id}_input_markings.png", dpi=150)
plt.close('all')
In [ ]:
fans_per_tile = fans.groupby('tile_id').size().sort_values(ascending=False)
fans_per_tile.head()
In [ ]:
blotches_per_tile = blotches.groupby('tile_id').size().sort_values(ascending=False)
blotches_per_tile.head()
In [ ]:
print(fans_per_tile.median())
blotches_per_tile.median()
In [ ]:
plt.close('all')
In [ ]:
by_image_id = combined.groupby(['marking', 'tile_id']).size()
In [ ]:
by_image_id.name = 'Markings per tile'
In [ ]:
by_image_id = by_image_id.reset_index()
In [ ]:
by_image_id.columns
In [ ]:
g = sns.FacetGrid(by_image_id, col="marking", aspect=1.2)
bins = np.arange(0, 280, 5)
g.map(sns.distplot, 'Markings per tile', kde=False, bins=bins, hist_kws={'log':True})
plt.savefig('/Users/klay6683/Dropbox/src/p4_paper1/figures/number_distributions.pdf', dpi=150)
In [ ]:
blotches_per_tile.median()
In [ ]:
from planet4 import plotting
In [ ]:
# %load -n plotting.plot_finals_with_input
def plot_finals_with_input(id_, datapath=None, horizontal=True, scope='planet4'):
    """Plot the raw tile next to its final fan/blotch catalog markings.

    Parameters
    ----------
    id_ : str
        Planet Four tile id (e.g. '7t9').
    datapath : str or pathlib.Path, optional
        Folder with the final catalog files, handed to io.PathManager.
    horizontal : bool, optional
        If True, panels are side by side; otherwise stacked vertically.
    scope : str, optional
        Scope argument forwarded to markings.ImageID.

    Side effects
    ------------
    Saves the figure as <imgid>_final.png into the (hard-coded) paper
    figures folder.
    """
    imgid = markings.ImageID(id_, scope=scope)
    pm = io.PathManager(id_=id_, datapath=datapath)
    layout = {'ncols': 2} if horizontal else {'nrows': 2}
    fig, ax = plt.subplots(figsize=(4, 5), **layout)
    ax[0].set_title(imgid.imgid, fontsize=8)
    imgid.show_subframe(ax=ax[0])
    for marking in ['fan', 'blotch']:
        try:
            df = getattr(pm, f"final_{marking}df")
        except Exception:
            # Narrowed from a bare `except:` (which also caught
            # KeyboardInterrupt/SystemExit); a missing final file for this
            # marking type is expected and simply skipped.
            continue
        else:
            data = df[df.image_id == imgid.imgid]
            imgid.plot_markings(marking, data, ax=ax[1])
    fig.subplots_adjust(top=0.95, bottom=0, left=0, right=1, hspace=0.01, wspace=0.01)
    fig.savefig(f"/Users/klay6683/Dropbox/src/p4_paper1/figures/{imgid.imgid}_final.png",
                dpi=150)
In [ ]:
plot_finals_with_input('7t9', rm.savefolder, horizontal=False)
In [ ]:
markings.ImageID('7t9').image_name
In [ ]:
fans['distance_m'] = fans.distance*fans.map_scale
blotches['radius_1_m'] = blotches.radius_1*blotches.map_scale
blotches['radius_2_m'] = blotches.radius_2*blotches.map_scale
In [ ]:
n_fan_in = 2792963
In [ ]:
fans.shape[0]
In [ ]:
fans.shape[0] / n_fan_in
In [ ]:
import scipy
scipy.stats.percentileofscore(fans.distance_m, 100)
Cumulative histogram of fan lengths
In [ ]:
def add_percentage_line(ax, meters, column):
    """Annotate `ax` with the cumulative fraction of `column` below `meters`.

    Draws a dashed horizontal line at that fraction, a dashed vertical line
    at `meters`, and labels the crossing point with the fraction (2 decimals).
    """
    fraction = scipy.stats.percentileofscore(column, meters) / 100
    dashed = dict(linestyle='dashed', color='black', lw=1)
    ax.axhline(fraction, **dashed)
    ax.axvline(meters, **dashed)
    ax.text(meters, fraction, f"{fraction:0.2f}")
In [ ]:
plt.close('all')
In [ ]:
fans.distance_m.max()
In [ ]:
bins = np.arange(0,380, 5)
In [ ]:
fig, ax = plt.subplots(figsize=(8,3), ncols=2, sharey=False)
sns.distplot(fans.distance_m, bins=bins, kde=False,
hist_kws={'cumulative':False,'normed':True, 'log':True},
axlabel='Fan length [m]', ax=ax[0])
sns.distplot(fans.distance_m, bins=bins, kde=False, hist_kws={'cumulative':True,'normed':True},
axlabel='Fan length [m]', ax=ax[1])
ax[0].set_title("Normalized Log-Histogram of fan lengths ")
ax[1].set_title("Cumulative normalized histogram of fan lengths")
ax[1].set_ylabel("Fraction of fans with given length")
add_percentage_line(ax[1], 100, fans.distance_m)
add_percentage_line(ax[1], 50, fans.distance_m)
fig.tight_layout()
fig.savefig("/Users/klay6683/Dropbox/src/p4_paper1/figures/fan_lengths_histos.pdf",
dpi=150, bbox_inches='tight')
In [ ]:
fans.query('distance_m>350')[['distance_m', 'obsid', 'l_s']]
In [ ]:
fans.distance_m.describe()
In words, the mean length of fans is {{f"{fans.distance_m.describe()['mean']:.1f}"}} m, while the median is {{f"{fans.distance_m.describe()['50%']:.1f}"}} m.
In [ ]:
fans.replace("Manhattan_Frontinella", "Manhattan_\nFrontinella", inplace=True)
In [ ]:
fig, ax = plt.subplots()
sns.boxplot(y="region", x="distance_m", data=fans, ax=ax,
fliersize=3)
ax.set_title("Fan lengths in different ROIs")
fig.tight_layout()
fig.savefig("/Users/klay6683/Dropbox/src/p4_paper1/figures/fan_lengths_vs_regions.pdf",
dpi=150, bbox_inches='tight')
In [ ]:
plt.figure()
cols = ['radius_1','radius_2']
sns.distplot(blotches[cols], kde=False, bins=np.arange(2.0,50.),
color=['r','g'], label=cols)
plt.legend()
In [ ]:
plt.figure()
cols = ['radius_1_m','radius_2_m']
sns.distplot(blotches[cols], kde=False, bins=np.arange(2.0,50.),
color=['r','g'], label=cols)
plt.legend()
In [ ]:
# Cumulative normalized histogram of blotch radius_2 (meters).
fig, ax = plt.subplots(figsize=(8, 4))
# Fixed labels: they said "radius_1" although the plotted column is radius_2_m.
sns.distplot(blotches.radius_2_m, bins=500, kde=False,
             hist_kws={'cumulative': True, 'normed': True},
             axlabel='Blotch radius_2 [m]', ax=ax)
ax.set_title("Cumulative normalized histogram for blotch lengths")
ax.set_ylabel("Fraction of blotches with given radius_2")
add_percentage_line(ax, 30, blotches.radius_2_m)
add_percentage_line(ax, 10, blotches.radius_2_m)
In [ ]:
import scipy
scipy.stats.percentileofscore(blotches.radius_2_m, 30)
In [ ]:
plt.close('all')
In [ ]:
fans.query('distance_m > 350')[
'distance_m distance obsid image_x image_y tile_id'.split()].sort_values(
by='distance_m')
In [ ]:
from planet4 import plotting
In [ ]:
plotting.plot_finals('de3', datapath=rm.catalog)
plt.gca().set_title('APF0000de3')
In [ ]:
plotting.plot_image_id_pipeline('de3', datapath=rm.catalog, via_obsid=False, figsize=(12,8))
In [ ]:
from planet4 import region_data
In [ ]:
from planet4 import stats
In [ ]:
stats.define_season_column(fans)
stats.define_season_column(blotches)
In [ ]:
fans.season.value_counts()
In [ ]:
fans.query('season==2').distance_m.median()
In [ ]:
fans.query('season==3').distance_m.median()
In [ ]:
from planet4 import region_data
In [ ]:
# Median fan length per region and per season: for each named region class
# in region_data, look up its season2/season3 obsid lists and print the
# median distance_m of the fans in those observations.
# (Loop-body indentation was lost in this notebook export.)
for region in ['Manhattan2', 'Giza','Ithaca']:
print(region)
obj = getattr(region_data, region)
for s in ['season2','season3']:
print(s)
obsids = getattr(obj, s)
print(fans[fans.obsid.isin(obsids)].distance_m.median())
In [ ]:
db = io.DBManager()
In [ ]:
all_data = db.get_all()
In [ ]:
image_names = db.image_names
In [ ]:
g_all = all_data.groupby('image_id')
In [ ]:
g_all.size().sort_values().head()
In [ ]:
fans.columns
In [ ]:
cols_to_drop = ['path', 'image_name', 'binning', 'LineResolution', 'SampleResolution', 'Line', 'Sample']
In [ ]:
fans.drop(cols_to_drop, axis=1, inplace=True, errors='ignore')
In [ ]:
fans.columns
In [ ]:
fans.iloc[1]
In [ ]:
# A pasted LaTeX table body (one HiRISE observation per row); the obsids in
# its first column are extracted below to query the metadata table.
s = """ESP\_011296\_0975 & -82.197 & 225.253 & 178.8 & 2008-12-23 & 17:08 & 91 \\
ESP\_011341\_0980 & -81.797 & 76.13 & 180.8 & 2008-12-27 & 17:06 & 126 \\
ESP\_011348\_0950 & -85.043 & 259.094 & 181.1 & 2008-12-27 & 18:01 & 91 \\
ESP\_011350\_0945 & -85.216 & 181.415 & 181.2 & 2008-12-27 & 16:29 & 126 \\
ESP\_011351\_0945 & -85.216 & 181.548 & 181.2 & 2008-12-27 & 18:18 & 91 \\
ESP\_011370\_0980 & -81.925 & 4.813 & 182.1 & 2008-12-29 & 17:08 & 126 \\
ESP\_011394\_0935 & -86.392 & 99.068 & 183.1 & 2008-12-31 & 19:04 & 72 \\
ESP\_011403\_0945 & -85.239 & 181.038 & 183.5 & 2009-01-01 & 16:56 & 164 \\
ESP\_011404\_0945 & -85.236 & 181.105 & 183.6 & 2009-01-01 & 18:45 & 91 \\
ESP\_011406\_0945 & -85.409 & 103.924 & 183.7 & 2009-01-01 & 17:15 & 126 \\
ESP\_011407\_0945 & -85.407 & 103.983 & 183.7 & 2009-01-01 & 19:04 & 91 \\
ESP\_011408\_0930 & -87.019 & 86.559 & 183.8 & 2009-01-01 & 19:43 & 59 \\
ESP\_011413\_0970 & -82.699 & 273.129 & 184.0 & 2009-01-01 & 17:17 & 108 \\
ESP\_011420\_0930 & -87.009 & 127.317 & 184.3 & 2009-01-02 & 20:16 & 54 \\
ESP\_011422\_0930 & -87.041 & 72.356 & 184.4 & 2009-01-02 & 20:15 & 54 \\
ESP\_011431\_0930 & -86.842 & 178.244 & 184.8 & 2009-01-03 & 19:41 & 54 \\
ESP\_011447\_0950 & -84.805 & 65.713 & 185.5 & 2009-01-04 & 17:19 & 218 \\
ESP\_011448\_0950 & -84.806 & 65.772 & 185.6 & 2009-01-04 & 19:09 & 59 \\"""
In [ ]:
# Split on the LaTeX row terminator to get one string per table row.
lines = s.split(' \\')
In [ ]:
# NOTE(review): this replace() result is not assigned — the statement has no
# effect (strings are immutable); only its cell display output was used.
s.replace('\\', '')
In [ ]:
# First '&'-separated field of each row, stripped of the LaTeX escape
# backslashes, gives the obsid; the trailing empty split fragment is dropped.
obsids = [line.split('&')[0].strip().replace('\\','') for line in lines][:-1]
In [ ]:
meta = pd.read_csv(rm.metadata_path)
In [ ]:
meta.query('obsid in @obsids').sort_values(by='obsid').
In [ ]:
blotches.groupby('obsid').north_azimuth.nunique()
In [ ]:
db = io.DBManager()
In [ ]:
db.dbname = '/Users/klay6683/local_data/planet4/2018-02-11_planet_four_classifications_queryable_cleaned_seasons2and3.h5'
In [ ]:
# Read only the user_name column from the HDF5 store (select_column avoids
# loading the full table) and keep the unique names.
# (Block indentation was lost in this notebook export.)
with pd.HDFStore(str(db.dbname)) as store:
user_names = store.select_column('df', 'user_name').unique()
In [ ]:
user_names.shape
In [ ]:
user_names[:10]
In [ ]:
not_logged = [i for i in user_names if i.startswith('not-logged-in')]
In [ ]:
logged = list(set(user_names) - set(not_logged))
In [ ]:
len(logged)
In [ ]:
len(not_logged)
In [ ]:
not_logged[:20]
In [ ]:
df = db.get_all()
In [ ]:
df[df.marking=='fan'].shape
In [ ]:
df[df.marking=='blotch'].shape
In [ ]:
df[df.marking=='interesting'].shape
In [ ]:
n_class_by_user = df.groupby('user_name').classification_id.nunique()
In [ ]:
n_class_by_user.describe()
In [ ]:
logged_users = df.user_name[~df.user_name.str.startswith("not-logged-in")].unique()
In [ ]:
logged_users.shape
In [ ]:
not_logged = list(set(df.user_name.unique()) - set(logged_users))
In [ ]:
len(not_logged)
In [ ]:
n_class_by_user[not_logged].describe()
In [ ]:
n_class_by_user[logged_users].describe()
In [ ]:
n_class_by_user[n_class_by_user>50].shape[0]/n_class_by_user.shape[0]
In [ ]:
n_class_by_user.shape
In [ ]:
pm = io.PathManager('any', datapath=rm.savefolder)
In [ ]:
cols1 = pm.fandf.columns[:8]
cols2 = pm.fandf.columns[8:-2]
cols3 = pm.fandf.columns[-2:]
In [ ]:
print(pm.fandf[cols1].to_latex())
In [ ]:
print(pm.fandf[cols2].to_latex())
In [ ]:
print(pm.fandf[cols3].to_latex())
In [ ]:
df = pm.fnotchdf.head(4)
In [ ]:
cols1 = df.columns[:6]
cols2 = df.columns[6:14]
cols3 = df.columns[14:]
In [ ]:
# Print each column group as a LaTeX table; iterate over the variables
# directly instead of reconstructing their names with eval().
for cols in (cols1, cols2, cols3):
    print(df[cols].to_latex())
In [ ]: