In [2]:
%matplotlib inline
In [3]:
import pandas as pd
In [4]:
PATH = 'data/'
In [5]:
from glob import glob
In [6]:
import os; import sys
In [13]:
# Candidate raw-data files. Sorted so that g[0] is the same on every run, and
# with this notebook's own output (planet_names.csv, written further down)
# excluded so a re-run does not pick up the generated CSV as the input.
g = sorted(p for p in glob(PATH + 'planet*')
           if os.path.basename(p) != 'planet_names.csv'); g
Out[13]:
In [14]:
f'HELLO DUDE'.split(' ')
Out[14]:
In [16]:
# File name of the first match, without its directory part.
# os.path.basename works for any directory depth and for OS-native separators,
# unlike indexing the result of split('/').
fname = os.path.basename(g[0]); fname
Out[16]:
In [54]:
def reload_content(path=None):
    """Read a text file and return its lines.

    Parameters
    ----------
    path : string (optional)
        file to read. When omitted, the notebook-level ``PATH + fname`` is
        used, which is what the original zero-argument call did.

    Returns
    -------
    content : list of string
        every line of the file, line terminators included
    """
    if path is None:
        # resolved at call time so later changes to PATH / fname are honoured
        path = PATH + fname
    with open(path) as f:
        return f.readlines()
# Bind the result: the following cells read `content`, which was never
# assigned when the return value was discarded.
content = reload_content()
In [25]:
content[:50]
Out[25]:
In [33]:
ctemp = content[:20]
In [35]:
temp = 'somestring'
In [38]:
temp.count('s')
Out[38]:
In [44]:
# For each sample line, print whether it contains an opening parenthesis,
# followed by the line itself.
for line in ctemp:
    has_paren = "(" in line
    print(f'{has_paren}\n{line}')
In [45]:
import numpy as np
In [ ]:
# content = ...   (unfinished draft cell; kept as a comment because a bare
#                  `content =` is a SyntaxError)
In [46]:
content = np.array(content)
In [47]:
content.shape
Out[47]:
In [49]:
# Keep only the lines containing "(".
# The condition must be a materialised list: a bare generator is seen by
# np.where as one scalar object, not as a per-line mask. With a single
# argument np.where returns a tuple of index arrays, hence the [0].
content = content[np.where(["(" in line for line in content])[0]]
In [55]:
content = reload_content()
In [57]:
content = np.array(content)
In [58]:
content.shape
Out[58]:
In [63]:
idxs = np.where([line.count("(") > 0 for line in content])[0]
In [64]:
idxs
Out[64]:
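Looks like I made a mistake with how I was getting my indices: `np.where` called with only a condition returns a tuple holding one index array per dimension, so the indices I wanted were sitting inside a 1-element tuple and have to be pulled out with `[0]`. Oops.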
In [65]:
# Drop every line that has no "(" in it.
rows_with_id = ["(" in row for row in content]
content = content[np.where(rows_with_id)[0]]
In [66]:
content.shape
Out[66]:
That shaved off only a few lines; but those are lines that didn't have a planet ID number in parens in them. I think I can treat the rest of the array as containing planet names now.
In [67]:
??np.random.randint
In [121]:
content[np.random.randint(0, len(content))]
Out[121]:
Ctrl-Tabbing the above line a few times, looks like the pattern is:
(ID Number) PLANET_NAME looong space PLANET_NAME \\n
I can do a quick check to see if there are any planet names with a single space in them. After that I'll just grab whatever text string comes after the ID number and write that to a csv. Maybe I'll include the ID number just to be thorough.
NOTE: I love that there's a planet 'Karvelia'. Somewhere out there's a Georgian astronomer with stellar aspirations :D
In [129]:
temp = content[0].split(' '); temp
Out[129]:
In [130]:
temp = [elem for elem in temp if elem != '']; temp
Out[130]:
In [131]:
temp[-2], temp[-1]
Out[131]:
In [132]:
len(temp[-2])
Out[132]:
Idea: count the number of elements after the ID string index. If it's greater than 2, we have a name split by a space.
In [186]:
# True where the third-from-last space-separated token starts with "(".
# Each line has to be split into tokens first; iterating over the string
# itself yields single characters, not tokens.
check = [[token for token in line.split(' ') if token != ''][-3][0] == '(' for line in content]
In [198]:
# Tokenise the first line and test whether the third-from-last token is
# something other than the "(ID)" token.
line = content[0].split(' ')
tokens = [tok for tok in line if tok]
is_space = not tokens[-3].startswith('(')
print(f'line:\n{line}\n\ntokens:\n{tokens}\n\nspace:\n{is_space}')
In [204]:
# Positions of all lines whose third-from-last non-empty token does not start
# with "(".
spc_idxs = [
    pos
    for pos, raw in enumerate(content)
    if not [tok for tok in raw.split(' ') if tok][-3].startswith('(')
]
In [206]:
len(spc_idxs)
# spc_idxs[0]
Out[206]:
There are 511 of these names? Let's see.
In [208]:
content[np.random.choice(spc_idxs)]
Out[208]:
Ah, there's one.
In [228]:
content[np.random.choice(spc_idxs)]
Out[228]:
Okay here's what I'll do. Go through all these indices, keep track of the first token after the ID number, and take all the tokens after it until I see that same string again. This will ignore names like 'Make Make' (if I got that right), but... whatever.
In [252]:
# Prototype of the "(ID),NAME" extraction, run on the first line only.
df_line = []
line = content[0]
# break line into tokens, remove empty strings; find idx of ID token
tokens = [token for token in line.split(' ') if token != '']
for idx, token in enumerate(tokens):
    if token[0] == '(':
        start_idx = idx
        break
# comma-join ID-Number and first name token
new_line = ','.join((tokens[start_idx], tokens[start_idx+1]))
namelen = len(tokens[start_idx+1])  # used to identify repeat
# space-join all name tokens after first, until see first again
for token in tokens[start_idx+2:]:
    if len(token) > namelen and token[:namelen] == tokens[start_idx+1]:
        break
    # was `newline` (undefined name): NameError whenever this line was reached
    new_line = ' '.join((new_line, token))
df_line.append(new_line)
In [253]:
print(f'{[line]}\n{tokens}\nnamelen: {namelen}\n{new_line}\n{df_line}')
Looks good. Let's put it to work.
In [293]:
def get_name_id(line):
    """Get CSV ID and Planet Name

    Parameters
    ----------
    line : string
        text line from IAU Minor Planet Names list. See: http://www.minorplanetcenter.net/iau/lists/MPNames.html
        Layout seen in the data: ``(ID) NAME``, a long run of spaces, then the
        name repeated (possibly with backslash escapes) and a trailing
        backslash.

    Returns
    -------
    new_line : string
        Comma-Separated Value of PLANET_ID,PLANET_NAME. The ID keeps its
        parentheses; the name has no trailing whitespace or newline.

    Raises
    ------
    ValueError
        if `line` contains no token starting with ``(`` that is followed by
        a name.
    """
    import re  # function-scope import: this cell stays runnable by itself

    # ID = first whitespace-delimited token that opens with "(";
    # `rest` = everything after it, starting at the first name character.
    found = re.search(r'(?<!\S)(\(\S*)\s+(\S.*)', line, flags=re.DOTALL)
    if found is None:
        raise ValueError(f'no "(ID) NAME" pair found in line: {line!r}')
    id_token, rest = found.group(1), found.group(2).strip()

    # Preferred: the name column ends at the first run of 2+ whitespace
    # characters. This keeps multi-word names whole and does not depend on
    # the repeated column being spelled identically (e.g. escaped accents).
    columns = re.split(r'\s{2,}', rest, maxsplit=1)
    if len(columns) > 1:
        name = columns[0]
    else:
        # Fallback for a single-space separator: collect words until one
        # starts with the first word again (the repeated column).
        # Known limitation: with no column gap, a name whose later word starts
        # with the first word ("Make Make") is cut short.
        words = rest.split()
        first = words[0]
        kept = [first]
        for word in words[1:]:
            if word.startswith(first):
                break
            kept.append(word)
        name = ' '.join(kept)

    # cut at the first backslash if one exists (escape / line-end residue)
    name = name.split('\\', 1)[0].strip()
    return ','.join((id_token, name))
Result after fixing code to check for backslashes in names:
In [322]:
# Convert every raw line to its "(ID),NAME" form, then inspect entry 33.
csv_content = list(map(get_name_id, content))
csv_content[33]
Out[322]:
First run through with the function above. Had an issue with backslashes appearing in names -- My code wasn't looking for that case.
In [282]:
csv_content = [get_name_id(line) for line in content]
In [289]:
'\\' in temp
Out[289]:
In [291]:
temp = temp[:temp.index('\\')] if '\\' in temp else temp
In [292]:
temp
Out[292]:
In [287]:
temp[:temp.index('\\')]
Out[287]:
In [286]:
temp = "(274302),Abahazi Abah\\'e1zi\\\n"
temp.index('\\')
f'{temp[:21]}'
Out[286]:
In [283]:
csv_content[:10]
Out[283]:
In [284]:
csv_content[33]
Out[284]:
In [262]:
# Show the first 50 converted rows; slicing (rather than indexing 0..49) also
# works when fewer than 50 rows exist.
for row in csv_content[:50]:
    print(row)
Now we have our csv'ify function ready and our lines of csv-formatted text. Time to convert and save them as a pandas DataFrame.
In [307]:
# One row per "(ID),NAME" string. Split on the first comma only, so a name
# that itself contains a comma cannot create a third column.
planets_df = pd.DataFrame([line.split(',', 1) for line in csv_content])
In [308]:
planets_df.head()
Out[308]:
In [301]:
'(123)'.split('(')[1].split(')')[0]
Out[301]:
In [303]:
'(123)'[1:-1]
Out[303]:
In [311]:
planets_df[0].head()
Out[311]:
I want the ID numbers to not be in parens, so they can easily be converted to integers, so I'll redo that now.
In [328]:
ids = [line.split(',')[0] for line in csv_content]; ids[:5]
Out[328]:
In [329]:
ids = [ID[1:-1] for ID in ids]; ids[:5]
Out[329]:
In [330]:
planets_df[0] = ids
In [331]:
planets_df.head()
Out[331]:
That looks better. Now to give column names, save to disk, and write a random name generator function.
In [332]:
planets_df.columns = ['ids', 'names']
In [333]:
planets_df.head()
Out[333]:
In [336]:
# write the table to disk under PATH, without the row index
csv_path = PATH + 'planet_names.csv'
planets_df.to_csv(csv_path, index=False)
In [339]:
# the original data
content[:10]
Out[339]:
In [340]:
type(None)
Out[340]:
In [515]:
# import numpy as np
# import pandas as pd
# fpath='data/planet_names.csv'
# names_df = pd.read_csv(fpath)
def get_planet_name(seed=None, dataframe=None, fpath=None):
    """Return a random name from the IAU Minor Planet Names list.

    Parameters
    ----------
    seed : integer (optional)
        seed for a private NumPy random number generator, so the same seed
        always selects the same row. The global NumPy random state is not
        modified. Without a seed the global NumPy generator is used.
    dataframe : Pandas DataFrame (optional)
        table to draw from; column 0 is returned as the ID and column 1 as
        the name. Takes precedence over `fpath`.
    fpath : string (optional)
        path to csv file containing planet names.
        only used if `dataframe` not specified.

    Returns
    -------
    (name, id) : tuple
        a random IAU Minor Planet name, and its ID for future reference.
        ``None`` is returned (after printing a message) when neither
        `dataframe` nor `fpath` is given.
    """
    if dataframe is not None:
        names_df = dataframe
    elif fpath is not None:
        names_df = pd.read_csv(fpath)
    else:
        print("No DataFrame or file path specified.")
        return

    # NOTE: do not `import numpy as np` inside this function. An import
    # statement anywhere in a function body makes `np` a local name for the
    # whole function, so any use of `np` that runs before that import raises
    # UnboundLocalError. The module-level `np` / `pd` are used instead.

    # A seeded RandomState draws the same values that np.random.seed(seed)
    # followed by np.random.randint would, without reseeding the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    idx = rng.randint(0, len(names_df))

    # Pure positional (row, column) lookup; avoids integer `[]` on a
    # label-indexed row Series.
    return names_df.iloc[idx, 1], names_df.iloc[idx, 0]
get_planet_name(dataframe=planets_df)
Out[515]:
In [397]:
def temp(df=None):
    """Return the column-1 value of a uniformly random row of `df`.

    `df` must be a DataFrame with at least two columns; the ``None`` default
    is only a placeholder and is not usable.
    """
    idx = np.random.randint(0, len(df))
    # positional (row, column) lookup instead of integer `[]` on the row Series
    return df.iloc[idx, 1]
temp(df=names_df)
Out[397]:
Some tests of the function in action:
In [519]:
get_planet_name(dataframe=names_df)
Out[519]:
In [524]:
# Print the name part of 50 random picks.
for i in range(50):
    picked = get_planet_name(dataframe=names_df)
    print(picked[0])
To my infinite surprise and not at all sarcastic joy... Most of these 'planet names' are just the names of the guys who discovered them. Great....
Well I did learn a lot about manipulating unformatted data so... yay!
In [525]:
# Another 50 random picks, names only.
for i in range(50):
    print(str(get_planet_name(dataframe=names_df)[0]))
Oh hell yeah, let's go to planet Salmon..
In [534]:
# 20 random picks, names only.
for i in range(20):
    picked = get_planet_name(dataframe=names_df)
    print(picked[0])
In [ ]:
In [ ]:
More debugging with imports:
In [382]:
# why does this not give me an unbound error...
# Answer: nothing inside the lambda assigns or imports `np`, so `np` is looked
# up as a global each time the lambda is called. UnboundLocalError only occurs
# when a function body binds a name somewhere (e.g. a local
# `import numpy as np`) and reads it before that binding has executed.
tempfn = lambda x: np.log(x)
tempfn(3)
Out[382]:
In [385]:
del tempfn
In [388]:
# and neither does this.....
def tempfn(x):
    """Natural logarithm of `x`, computed with the module-level `np`."""
    result = np.log(x)
    return result
tempfn(3)
Out[388]:
In [394]:
del tempfn
def tempfn(x):
    """Reseed the global NumPy RNG with 0 and return one draw from [0, 10).

    `x` is accepted but not used.
    """
    np.random.seed(0)
    draw = np.random.randint(0, 10)
    return draw
tempfn(3)
Out[394]:
In [390]:
get_planet_name(dataframe=planets_df)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
Some tests before I got the function working, below:
In [341]:
len(planets_df)
Out[341]:
In [344]:
len(planets_df['ids'])
Out[344]:
In [350]:
planets_df.iloc[0]
Out[350]:
In [351]:
temp = planets_df.iloc[0]
In [352]:
temp
Out[352]:
In [353]:
temp[0]
Out[353]:
In [354]:
temp[1]
Out[354]:
In [359]:
temp = planets_df.iloc[0][1], planets_df.iloc[0][0]; temp
Out[359]:
In [360]:
temp[0]
Out[360]:
In [358]:
temp = "X"; temp
Out[358]:
In [361]:
temp = "'X"; temp
Out[361]:
In [ ]: