In [2]:
%matplotlib inline

In [3]:
import pandas as pd

In [4]:
PATH = 'data/'

In [5]:
from glob import glob

In [6]:
import os; import sys

In [13]:
g = glob(PATH + f'planet*'); g


Out[13]:
['data/planetnames.rtf']

In [14]:
f'HELLO DUDE'.split(' ')


Out[14]:
['HELLO', 'DUDE']

In [16]:
fname = g[0].split('/')[1]; fname


Out[16]:
'planetnames.rtf'

In [54]:
def reload_content():
    with open(PATH + fname) as f:
        content = f.readlines()
    return content

reload_content();

In [25]:
content[:50]


Out[25]:
['{\\rtf1\\ansi\\ansicpg1252\\cocoartf1561\\cocoasubrtf100\n',
 '{\\fonttbl\\f0\\fmodern\\fcharset0 Courier;}\n',
 '{\\colortbl;\\red255\\green255\\blue255;\\red0\\green0\\blue0;\\red255\\green255\\blue255;}\n',
 '{\\*\\expandedcolortbl;;\\cssrgb\\c0\\c0\\c0;\\cssrgb\\c100000\\c100000\\c100000;}\n',
 '\\margl1440\\margr1440\\vieww10460\\viewh9600\\viewkind0\n',
 '\\deftab720\n',
 '\\pard\\pardeftab720\\sl264\\partightenfactor0\n',
 '\n',
 '\\f0\\fs24 \\cf2 \\cb3 \\expnd0\\expndtw0\\kerning0\n',
 "\\outl0\\strokewidth0 \\strokec2 (388282) 'Akepa                        'Akepa\\\n",
 "\\ul   (3192) A'Hearn                       A'Hearn\\ulnone \\\n",
 '  (3654) AAS                           AAS\\\n',
 '  (8900) AAVSO                         AAVSO\\\n',
 '  (8721) AMOS                          AMOS\\\n',
 '  (9996) ANS                           ANS\\\n',
 '(132524) APL                           APL\\\n',
 ' (13830) ARLT                          ARLT\\\n',
 ' (31531) ARRL                          ARRL\\\n',
 '  (3568) ASCII                         ASCII\\\n',
 '  (2848) ASP                           ASP\\\n',
 ' (20813) Aakashshah                    Aakashshah\\\n',
 ' (26557) Aakritijain                   Aakritijain\\\n',
 ' (28698) Aakshi                        Aakshi\\\n',
 ' (28828) Aalamiharandi                 Aalamiharandi\\\n',
 ' (33181) Aalokpatwa                    Aalokpatwa\\\n',
 '   (677) Aaltje                        Aaltje\\\n',
 '  (2676) Aarhus                        Aarhus\\\n',
 '(129100) Aaronammons                   Aaronammons\\\n',
 ' (22656) Aaronburrows                  Aaronburrows\\\n',
 ' (25677) Aaronenten                    Aaronenten\\\n',
 ' (11451) Aarongolden                   Aarongolden\\\n',
 ' (23113) Aaronhakim                    Aaronhakim\\\n',
 ' (12553) Aaronritter                   Aaronritter\\\n',
 ' (13928) Aaronrogers                   Aaronrogers\\\n',
 ' (21933) Aaronrozon                    Aaronrozon\\\n',
 ' (29812) Aaronsolomon                  Aaronsolomon\\\n',
 '  (3277) Aaronson                      Aaronson\\\n',
 ' (33448) Aaronyeiser                   Aaronyeiser\\\n',
 '  (9836) Aarseth                       Aarseth\\\n',
 '  (2366) Aaryn                         Aaryn\\\n',
 '   (864) Aase                          Aase\\\n',
 '  (2678) Aavasaksa                     Aavasaksa\\\n',
 "(274302) Abahazi                       Abah\\'e1zi\\\n",
 '  (4466) Abai                          Abai\\\n',
 '  (2722) Abalakin                      Abalakin\\\n',
 '  (1581) Abanderada                    Abanderada\\\n',
 '  (3480) Abante                        Abante\\\n',
 '  (4263) Abashiri                      Abashiri\\\n',
 '  (1390) Abastumani                    Abastumani\\\n',
 '  (5224) Abbe                          Abbe\\\n']

In [33]:
ctemp = content[:20]

In [35]:
temp = 'somestring'

In [38]:
temp.count('s')


Out[38]:
2

In [44]:
for line in ctemp:
    print(f'{True if line.count("(") > 0 else False}\n{line}')


False
{\rtf1\ansi\ansicpg1252\cocoartf1561\cocoasubrtf100

False
{\fonttbl\f0\fmodern\fcharset0 Courier;}

False
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red255\green255\blue255;}

False
{\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000;}

False
\margl1440\margr1440\vieww10460\viewh9600\viewkind0

False
\deftab720

False
\pard\pardeftab720\sl264\partightenfactor0

False


False
\f0\fs24 \cf2 \cb3 \expnd0\expndtw0\kerning0

True
\outl0\strokewidth0 \strokec2 (388282) 'Akepa                        'Akepa\

True
\ul   (3192) A'Hearn                       A'Hearn\ulnone \

True
  (3654) AAS                           AAS\

True
  (8900) AAVSO                         AAVSO\

True
  (8721) AMOS                          AMOS\

True
  (9996) ANS                           ANS\

True
(132524) APL                           APL\

True
 (13830) ARLT                          ARLT\

True
 (31531) ARRL                          ARRL\

True
  (3568) ASCII                         ASCII\

True
  (2848) ASP                           ASP\


In [45]:
import numpy as np

In [ ]:
content =

In [46]:
content = np.array(content)

In [47]:
content.shape


Out[47]:
(21167,)

In [49]:
content = content[np.where(line.count("(") > 0 for line in content)]

In [55]:
content = reload_content()

In [57]:
content = np.array(content)

In [58]:
content.shape


Out[58]:
(21167,)

In [63]:
idxs = np.where([line.count("(") > 0 for line in content])[0]

In [64]:
idxs


Out[64]:
array([    9,    10,    11, ..., 21163, 21164, 21165])

Looks like I made a mistake with how I was getting my indices... they were inside a 1D tuple for some reason.. oops.


In [65]:
content = content[np.where([line.count("(") > 0 for line in content])[0]]

In [66]:
content.shape


Out[66]:
(21157,)

That shaved off only a few lines; but those are lines that didn't have a planet ID number in parens in them. I think I can treat the rest of the array as containing planet names now.


In [67]:
??np.random.randint

In [121]:
content[np.random.randint(0, len(content))]


Out[121]:
'   (781) Kartvelia                     Kartvelia\\\n'

Ctrl-Tabbing the above line a few times, looks like the pattern is:

(ID Number) PLANET_NAME looong space PLANET_NAME \\n

I can do a quick check to see if there are any planet names with a single space in them. After that I'll just grab whatever text string comes after the ID number and write that to a csv. Maybe I'll include the ID number just to be thorough.

NOTE: I love that there's a planet 'Karvelia'. Somewhere out there's a Georgian astronomer with stellar aspirations :D


In [129]:
temp = content[0].split(' '); temp


Out[129]:
['\\outl0\\strokewidth0',
 '\\strokec2',
 '(388282)',
 "'Akepa",
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 "'Akepa\\\n"]

In [130]:
temp = [elem for elem in temp if elem != '']; temp


Out[130]:
['\\outl0\\strokewidth0', '\\strokec2', '(388282)', "'Akepa", "'Akepa\\\n"]

In [131]:
temp[-2], temp[-1]


Out[131]:
("'Akepa", "'Akepa\\\n")

In [132]:
len(temp[-2])


Out[132]:
6

Idea: count the number of elements after the ID string index. If it's greater than 2, we have a name split by a space.


In [186]:
check = [[token for token in line if token != ''][-3][0] == '(' for line in content]

In [198]:
line = content[0].split(' '); line
tokens = [token for token in line if token != '']
is_space = tokens[-3][0] != '('

print(f'line:\n{line}\n\ntokens:\n{tokens}\n\nspace:\n{is_space}')


line:
['\\outl0\\strokewidth0', '\\strokec2', '(388282)', "'Akepa", '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', "'Akepa\\\n"]

tokens:
['\\outl0\\strokewidth0', '\\strokec2', '(388282)', "'Akepa", "'Akepa\\\n"]

space:
False

In [204]:
spc_idxs = []
for idx, line in enumerate(content):
    tokens = [token for token in line.split(' ') if token != '']
    if tokens [-3][0] != '(':
        spc_idxs.append(idx)

In [206]:
len(spc_idxs)
# spc_idxs[0]


Out[206]:
511

There are 511 of these names? Let's see.


In [208]:
content[np.random.choice(spc_idxs)]


Out[208]:
'  (1677) Tycho Brahe                   Tycho Brahe\\\n'

Ah, there's one.


In [228]:
content[np.random.choice(spc_idxs)]


Out[228]:
'  (1781) Van Biesbroeck                Van Biesbroeck\\\n'

Okay here's what I'll do. Go through all these indices, keep track of the first token after the ID number, and take all the tokens after it until I see that same string again. This will ignore names like 'Make Make" (if I got that right), but... whatever.


In [252]:
df_line = []
line = content[0]
# break line into tokens, remove empty strings; find idx of ID token
tokens = [token for token in line.split(' ') if token != '']
for idx, token in enumerate(tokens):
    if token[0] == '(':
        start_idx = idx
        break
# comma-join ID-Number and first name token
new_line = ','.join((tokens[start_idx], tokens[start_idx+1]))
namelen = len(tokens[start_idx+1]) # used to identify repeat
# space-join all name tokens after first, until see first again
for token in tokens[start_idx+2:]:
    if len(token) > namelen and token[:namelen] == tokens[start_idx+1]:
        break
    new_line = ' '.join((newline, token))
df_line.append(new_line)

In [253]:
print(f'{[line]}\n{tokens}\nnamelen: {namelen}\n{new_line}\n{df_line}')


["\\outl0\\strokewidth0 \\strokec2 (388282) 'Akepa                        'Akepa\\\n"]
['\\outl0\\strokewidth0', '\\strokec2', '(388282)', "'Akepa", "'Akepa\\\n"]
namelen: 6
(388282),'Akepa
["(388282),'Akepa"]

Looks good. Let's put it to work.


In [293]:
def get_name_id(line):
    """Get CSV ID and Planet Name
    
    Parameters
    ----------
    line : string
        text line from IAU Minor Planet Names list. See: http://www.minorplanetcenter.net/iau/lists/MPNames.html
        
    Returns
    -------
    new_line : string
        Comma-Separated Value of PLANET_ID,PLANET_NAME"""
    # break line into tokens, remove empty strings; find idx of ID token
    tokens = [token for token in line.split(' ') if token != '']
    for idx, token in enumerate(tokens):
        if token[0] == '(':
            start_idx = idx
            break
    # comma-join ID-Number and first name token
    new_line = ','.join((tokens[start_idx], tokens[start_idx+1]))
    namelen = len(tokens[start_idx+1]) # used to identify repeat
    # space-join all name tokens after first, until see first again
    for token in tokens[start_idx+2:]:
        if len(token) > namelen and token[:namelen] == tokens[start_idx+1]:
            break
        new_line = ' '.join((new_line, token))
        
        # cut new_line at first index of `\` if it exists
        new_line = new_line[:new_line.index('\\')] if '\\' in new_line else new_line
        
    return new_line

Result after fixing code to check for backslashes in names:


In [322]:
csv_content = [get_name_id(line) for line in content]
csv_content[33]


Out[322]:
'(274302),Abahazi Abah'

First run through with the function above. Had an issue with backslashes appearing in names -- My code wasn't looking for that case.


In [282]:
csv_content = [get_name_id(line) for line in content]

In [289]:
'\\' in temp


Out[289]:
True

In [291]:
temp = temp[:temp.index('\\')] if '\\' in temp else temp

In [292]:
temp


Out[292]:
'(274302),Abahazi Abah'

In [287]:
temp[:temp.index('\\')]


Out[287]:
'(274302),Abahazi Abah'

In [286]:
temp = "(274302),Abahazi Abah\\'e1zi\\\n"
temp.index('\\')
f'{temp[:21]}'


Out[286]:
'(274302),Abahazi Abah'

In [283]:
csv_content[:10]


Out[283]:
["(388282),'Akepa",
 "(3192),A'Hearn",
 '(3654),AAS',
 '(8900),AAVSO',
 '(8721),AMOS',
 '(9996),ANS',
 '(132524),APL',
 '(13830),ARLT',
 '(31531),ARRL',
 '(3568),ASCII']

In [284]:
csv_content[33]


Out[284]:
"(274302),Abahazi Abah\\'e1zi\\\n"

In [262]:
for i in range(50): print(csv_content[i])


(388282),'Akepa
(3192),A'Hearn
(3654),AAS
(8900),AAVSO
(8721),AMOS
(9996),ANS
(132524),APL
(13830),ARLT
(31531),ARRL
(3568),ASCII
(2848),ASP
(20813),Aakashshah
(26557),Aakritijain
(28698),Aakshi
(28828),Aalamiharandi
(33181),Aalokpatwa
(677),Aaltje
(2676),Aarhus
(129100),Aaronammons
(22656),Aaronburrows
(25677),Aaronenten
(11451),Aarongolden
(23113),Aaronhakim
(12553),Aaronritter
(13928),Aaronrogers
(21933),Aaronrozon
(29812),Aaronsolomon
(3277),Aaronson
(33448),Aaronyeiser
(9836),Aarseth
(2366),Aaryn
(864),Aase
(2678),Aavasaksa
(274302),Abahazi Abah\'e1zi\

(4466),Abai
(2722),Abalakin
(1581),Abanderada
(3480),Abante
(4263),Abashiri
(1390),Abastumani
(5224),Abbe
(17023),Abbott
(31631),Abbywilliams
(249010),Abdel-Samad
(15262),Abderhalden
(22638),Abdulla
(21483),Abdulrasool
(294600),Abedinabedin
(5379),Abehiroshi
(25410),Abejar

Now we have our csv'ify function ready and our lines of csv-formatted text. Time to convert and save them as a pandas DataFrame.


In [307]:
planets_df = pd.DataFrame([line.split(',') for line in csv_content])

In [308]:
planets_df.head()


Out[308]:
0 1
0 (388282) 'Akepa
1 (3192) A'Hearn
2 (3654) AAS
3 (8900) AAVSO
4 (8721) AMOS

In [301]:
'(123)'.split('(')[1].split(')')[0]


Out[301]:
'123'

In [303]:
'(123)'[1:-1]


Out[303]:
'123'

In [311]:
planets_df[0].head()


Out[311]:
0    (388282)
1      (3192)
2      (3654)
3      (8900)
4      (8721)
Name: 0, dtype: object

I want the ID numbers to not be in parens, so they can easily be converted to integers, so I'll redo that now.


In [328]:
ids = [line.split(',')[0] for line in csv_content]; ids[:5]


Out[328]:
['(388282)', '(3192)', '(3654)', '(8900)', '(8721)']

In [329]:
ids = [ID[1:-1] for ID in ids]; ids[:5]


Out[329]:
['388282', '3192', '3654', '8900', '8721']

In [330]:
planets_df[0] = ids

In [331]:
planets_df.head()


Out[331]:
0 1
0 388282 'Akepa
1 3192 A'Hearn
2 3654 AAS
3 8900 AAVSO
4 8721 AMOS

That looks better. Now to give column names, save to disk, and write a random name generator function.


In [332]:
planets_df.columns = ['ids', 'names']

In [333]:
planets_df.head()


Out[333]:
ids names
0 388282 'Akepa
1 3192 A'Hearn
2 3654 AAS
3 8900 AAVSO
4 8721 AMOS

In [336]:
# saving DataFrame to disk
planets_df.to_csv(PATH + f'planet_names.csv', index=False)

In [339]:
# the original data
content[:10]


Out[339]:
array([ "\\outl0\\strokewidth0 \\strokec2 (388282) 'Akepa                        'Akepa\\\n",
       "\\ul   (3192) A'Hearn                       A'Hearn\\ulnone \\\n",
       '  (3654) AAS                           AAS\\\n',
       '  (8900) AAVSO                         AAVSO\\\n',
       '  (8721) AMOS                          AMOS\\\n',
       '  (9996) ANS                           ANS\\\n',
       '(132524) APL                           APL\\\n',
       ' (13830) ARLT                          ARLT\\\n',
       ' (31531) ARRL                          ARRL\\\n',
       '  (3568) ASCII                         ASCII\\\n'],
      dtype='<U82')

In [340]:
type(None)


Out[340]:
NoneType

In [515]:
# import numpy as np
# import pandas as pd

# fpath='data/planet_names.csv'
# names_df = pd.read_csv(fpath)

def get_planet_name(seed=None, dataframe=None, fpath=None):
    """Return a random name from the IAU Minor Planet Names list.
    
    Parameters
    ----------
    seed : integer (optional)
        random seed for NumPy random number generator
    
    dataframe : Pandas DataFrame (optional)
        csv file to read names from.
        function will attempt to load DataFrame from `fpath` if `dataframe` 
        not specified.
        
    fpath: string (optional)
        path to csv file containing planet names.
        only used if `dataframe` not specified.
        both Pandas and NumPy will be imported in this case.
        
    Returns
    -------
    name : string
        a random IAU Minor Planet name, and its ID for future reference
    """
    
    # No DataFrame specified
    if type(dataframe) == type(None):
        if type(fpath) == type(None):
            print("No DataFrame or file path specified.")
            return
#         import numpy as np
#         import pandas as pd
        names_df = pd.read_csv(fpath)
    # DataFrame specified
    else:
        names_df = dataframe
        
    # dont know why Im getting an UnboundLocalError in this function
    # NOTE: very interesting... if I ever import a module, even in a conditional, 
    # every instance of that module not initialized from that import will throw an 
    # UnboundLocalError. Hmm... okay.
    if type(seed) != type(None):
        np.random.seed(seed)
#         try:
#             np.random.seed(seed)
#         except UnboundLocalError:
#             import numpy as np
#             np.random.seed(seed)
    
    idx = np.random.randint(0, len(names_df))
    
    return names_df.iloc[idx][1], names_df.iloc[idx][0]

get_planet_name(dataframe=planets_df)


Out[515]:
('Akimov', '4521')

In [397]:
def temp(df=None):
    idx = np.random.randint(0, len(df))
    return df.iloc[idx][1]

temp(df=names_df)


Out[397]:
'Verbitskaya'

Some tests of the function in action:


In [519]:
get_planet_name(dataframe=names_df)


Out[519]:
('Arizona', 793)

In [524]:
for i in range(50):
    print(f'{get_planet_name(dataframe=names_df)[0]}')


TRIUMF
Praamzius
Mirano
Saunders
Ayapani
Elodie
Lamarck
Lilyliu
Kopal
Emaparker
Gangkeda
Pohlonski
Duboshin
Sylvania
Stimson
Perrett
Jaisonjain
Angstrom 
Ramonkhanna
Adelgunde
Geisha
Perrine
Nomentum
Chikatoshi
Saskia
Frisia
Caddell
Michaelbecker
Kevlin
Yuuko
Prendergast
Ceraskia
Swift
Viikinkoski
JAXA
Yoshiken
Peterkraft
Zimin
Bretagnon
Doblin D
Naantali
Nishimura
Makibi
Alena
Annona
Degraaff
Joyce
Aletheia
Munkacsy Munk
Sterken

To my infinite suprise and not at all sarcastic joy... Most of these 'planet names' are just the names of the guys who discovered them. Great....

Well I did learn a lot about manipulating unformatted data so... yay!


In [525]:
for i in range(50):
    print(f'{get_planet_name(dataframe=names_df)[0]}')


Mattweegman
Tongkexue
Pamjones
Schoenmaker
Walker
Salmon
Cureau
Aoyagi
Gay-Lussac
Hollyerickson
Grigor'ev
Erdmannsdorff
Karetnikov
Messalina
Jurasek Jura
Fritzleiber
Tucholsky
Jackschmitt
Blankenship
Dolero
Valencia
Emmaburnett
Komendantov
Prime
Lewicki
Amysimon
Munch
Masuo
Zhaojiuzhang
Kathryn
Hermes
Vinceelliott
Odegard
Severochoa
Amandajane
McDermid
Sesar
Gropius
Favaloro
Lyudvasilia
Pianoro
Engelhardt
Kaibab
Goldinaaron
Demodokus
Upupa
Alonso
Osipovyurij
Hirose
Linchisheng

Oh hell yeah, let's go to planet Salmon..


In [534]:
for i in range(20): print(f'{get_planet_name(dataframe=names_df)[0]}')


Duerbeck
Wesson
Klima
Joshwood
Biver
Kangsunwoo
van Genderen van Genderen
Gulkis
Somekawa
Tomconnors
Bulgaria
Matttaylor
Dalestanbridge
Tarsila
Vespa
Hubertreeves
Sveshnikov
Linnaea
Huachucaclub
Covington

In [ ]:


In [ ]:

More debugging with imports:


In [382]:
# why does this not give me an unbound error...
tempfn = lambda x: np.log(x)
tempfn(3)


Out[382]:
1.0986122886681098

In [385]:
del tempfn

In [388]:
# and neither does this.....
def tempfn(x):
    return np.log(x)
tempfn(3)


Out[388]:
1.0986122886681098

In [394]:
del tempfn

def tempfn(x):
    np.random.seed(0)
    return np.random.randint(0,10)
#     return np.log(x)
tempfn(3)


Out[394]:
5

In [390]:
get_planet_name(dataframe=planets_df)


---------------------------------------------------------------------------
UnboundLocalError                         Traceback (most recent call last)
<ipython-input-390-e5e0c06db6c4> in <module>()
----> 1 get_planet_name(dataframe=planets_df)

<ipython-input-389-0f589e53b9a2> in get_planet_name(seed, dataframe, fpath)
     49             np.random.seed(seed)
     50 
---> 51     idx = np.random.randint(0, len(names_df))
     52 
     53     return names_df.iloc[idx][1], names_df.iloc[idx][0]

UnboundLocalError: local variable 'np' referenced before assignment

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Some tests before I got the function working, below:


In [341]:
len(planets_df)


Out[341]:
21157

In [344]:
len(planets_df['ids'])


Out[344]:
21157

In [350]:
planets_df.iloc[0]


Out[350]:
ids      388282
names    'Akepa
Name: 0, dtype: object

In [351]:
temp = planets_df.iloc[0]

In [352]:
temp


Out[352]:
ids      388282
names    'Akepa
Name: 0, dtype: object

In [353]:
temp[0]


Out[353]:
'388282'

In [354]:
temp[1]


Out[354]:
"'Akepa"

In [359]:
temp = planets_df.iloc[0][1], planets_df.iloc[0][0]; temp


Out[359]:
("'Akepa", '388282')

In [360]:
temp[0]


Out[360]:
"'Akepa"

In [358]:
temp = "X"; temp


Out[358]:
'X'

In [361]:
temp = "'X"; temp


Out[361]:
"'X"

In [ ]: