In [2020]:

    
import psycopg2 as pg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
%matplotlib inline



In [2021]:

    
# 30 May 2015 -> $ heroku pg:pull  HEROKU_POSTGRESQL_COBALT_URL lcp --app lux
!psql lcp --help | head









    



psql is the PostgreSQL interactive terminal.

Usage:
  psql [OPTION]... [DBNAME [USERNAME]]

General options:
  -c, --command=COMMAND    run only single command (SQL or internal) and exit
  -d, --dbname=DBNAME      database name to connect to (default: "excalibur")
  -f, --file=FILENAME      execute commands from file, then exit
  -l, --list               list available databases, then exit



In [2022]:

    
!psql -c "\dt" lcp









    



                List of relations
 Schema |        Name         | Type  |   Owner   
--------+---------------------+-------+-----------
 public | admins              | table | excalibur
 public | comments            | table | excalibur
 public | contributorlinks    | table | excalibur
 public | contributors        | table | excalibur
 public | documents           | table | excalibur
 public | elementals          | table | excalibur
 public | images              | table | excalibur
 public | inclusions          | table | excalibur
 public | items               | table | excalibur
 public | miscs               | table | excalibur
 public | periods             | table | excalibur
 public | petrofabrics        | table | excalibur
 public | petrofabrics_wares  | table | excalibur
 public | petrographics       | table | excalibur
 public | pg_search_documents | table | excalibur
 public | privacylists        | table | excalibur
 public | referencelinks      | table | excalibur
 public | references          | table | excalibur
 public | regionlinks         | table | excalibur
 public | regions             | table | excalibur
 public | schema_migrations   | table | excalibur
 public | shaperelates        | table | excalibur
 public | shapes              | table | excalibur
 public | sites               | table | excalibur
 public | tooltips            | table | excalibur
 public | users               | table | excalibur
 public | viewers             | table | excalibur
 public | voids               | table | excalibur
 public | wares               | table | excalibur
 public | workshoplinks       | table | excalibur
 public | workshops           | table | excalibur
(31 rows)



In [2023]:

    
!psql -c "\d+ wares" lcp









    



                                                           Table "public.wares"
     Column     |            Type             |                     Modifiers                      | Storage  | Stats target | Description 
----------------+-----------------------------+----------------------------------------------------+----------+--------------+-------------
 id             | integer                     | not null default nextval('wares_id_seq'::regclass) | plain    |              | 
 name           | character varying(255)      |                                                    | extended |              | 
 origin         | character varying(255)      |                                                    | extended |              | 
 chron_range    | character varying(255)      |                                                    | extended |              | 
 desc           | text                        |                                                    | extended |              | 
 created_at     | timestamp without time zone |                                                    | plain    |              | 
 updated_at     | timestamp without time zone |                                                    | plain    |              | 
 user_id        | integer                     |                                                    | plain    |              | 
 old_region_id  | integer                     |                                                    | plain    |              | 
 privacy_status | integer                     | default 1                                          | plain    |              | 
 contributor_id | integer                     |                                                    | plain    |              | 
 period         | character varying(255)      |                                                    | extended |              | 
 start_year     | integer                     |                                                    | plain    |              | 
 end_year       | integer                     |                                                    | plain    |              | 
 definition     | text                        |                                                    | extended |              | 
Indexes:
    "wares_pkey" PRIMARY KEY, btree (id)



In [2024]:

    
psql_lcp_connection = pg.connect("dbname=lcp user=excalibur")



In [2025]:

    
psql_cursor = psql_lcp_connection.cursor()



In [2026]:

    
psql_cursor.execute("select * from wares")



In [2027]:

    
psql_cursor.fetchone()









    Out[2027]:





(32,
 'Red White and Blue ware',
 None,
 '1800 - 1500 BCE',
 'The ware is characterized by the decoration of alternating wavy and straight lines painted in red, white and blue.',
 datetime.datetime(2012, 9, 17, 21, 28, 23, 515035),
 datetime.datetime(2014, 5, 13, 21, 7, 45, 547331),
 5,
 24,
 1,
 None,
 'Middle Bronze Age IIA-IIB',
 -1800,
 -1500,
 None)



In [2028]:

    
psql_cursor.close()



In [2029]:

    
wares_df = pd.io.sql.read_sql("select * from wares", psql_lcp_connection)
wares_df.head(1)









    Out[2029]:






  
    
      
      id
      name
      origin
      chron_range
      desc
      created_at
      updated_at
      user_id
      old_region_id
      privacy_status
      contributor_id
      period
      start_year
      end_year
      definition
    
  
  
    
      0
      32
      Red White and Blue ware
      None
      1800 - 1500 BCE
      The ware is characterized by the decoration of...
      2012-09-17 21:28:23.515035
      2014-05-13 21:07:45.547331
      5
      24
      1
      None
      Middle Bronze Age IIA-IIB
      -1800
      -1500
      None



In [2030]:

    
wares_df.describe()









    Out[2030]:






  
    
      
      id
      user_id
      old_region_id
      privacy_status
      start_year
      end_year
    
  
  
    
      count
      224.000000
      222.000000
      85.000000
      224.000000
      155.000000
      155.000000
    
    
      mean
      148.602679
      27.234234
      12.729412
      1.263393
      -1253.290323
      193.580645
    
    
      std
      69.727324
      36.824263
      7.698776
      0.654271
      10503.905823
      848.788066
    
    
      min
      21.000000
      1.000000
      1.000000
      1.000000
      -92500.000000
      -2500.000000
    
    
      25%
      88.750000
      5.000000
      8.000000
      1.000000
      -800.000000
      -300.500000
    
    
      50%
      148.500000
      13.000000
      13.000000
      1.000000
      0.000000
      99.000000
    
    
      75%
      205.250000
      28.000000
      19.000000
      1.000000
      435.000000
      714.500000
    
    
      max
      271.000000
      165.000000
      24.000000
      4.000000
      2450.000000
      2250.000000



In [2031]:

    
wares_df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 224 entries, 0 to 223
Data columns (total 15 columns):
id                224 non-null int64
name              224 non-null object
origin            0 non-null object
chron_range       223 non-null object
desc              223 non-null object
created_at        224 non-null datetime64[ns]
updated_at        224 non-null datetime64[ns]
user_id           222 non-null float64
old_region_id     85 non-null float64
privacy_status    224 non-null int64
contributor_id    0 non-null object
period            67 non-null object
start_year        155 non-null float64
end_year          155 non-null float64
definition        11 non-null object
dtypes: datetime64[ns](2), float64(4), int64(2), object(7)
memory usage: 28.0+ KB



In [2032]:

    
name_start_year_df = wares_df.loc[wares_df['start_year'].notnull(),['name','start_year']]
print name_start_year_df.count()
name_start_year_df.head(1)









    



name          155
start_year    155
dtype: int64






    Out[2032]:






  
    
      
      name
      start_year
    
  
  
    
      0
      Red White and Blue ware
      -1800



In [2033]:

    
name_start_year_df = name_start_year_df.sort('start_year')



In [2034]:

    
unique_start_years = name_start_year_df['start_year'].unique()



In [2035]:

    
sns.boxplot(unique_start_years, names=['start_years'], vert=False)
plt.show()



In [2036]:

    
sns.boxplot(unique_start_years[1:], names=['start_years'], vert=False)
plt.show()



In [2037]:

    
start_year_counts = wares_df['start_year'].value_counts()
start_year_counts.head()









    Out[2037]:





 1100    13
-200      9
 100      8
 0        7
 800      7
dtype: int64



In [2038]:

    
plt.plot(wares_df['start_year'])
plt.show()



In [2039]:

    
plt.plot(wares_df[wares_df['start_year'] != wares_df['start_year'].min()]['start_year'])
plt.show()



In [2040]:

    
wares_df['period'].value_counts().head(10)









    Out[2040]:





                             19
Roman, Byzantine              5
Hellenistic                   3
Iron Age                      3
Early Iron Age I              2
Early Islamic                 2
Mamluk period                 2
Late Roman - Byzantine        1
Middle Bronze Age IIA-IIB     1
Hellenistic, Roman            1
dtype: int64



In [2041]:

    
desc_wares_df = wares_df[(wares_df['desc'] != "") & (wares_df['desc'].notnull())]
print desc_wares_df.shape
desc_wares_df.head(1)









    



(150, 15)






    Out[2041]:






  
    
      
      id
      name
      origin
      chron_range
      desc
      created_at
      updated_at
      user_id
      old_region_id
      privacy_status
      contributor_id
      period
      start_year
      end_year
      definition
    
  
  
    
      0
      32
      Red White and Blue ware
      None
      1800 - 1500 BCE
      The ware is characterized by the decoration of...
      2012-09-17 21:28:23.515035
      2014-05-13 21:07:45.547331
      5
      24
      1
      None
      Middle Bronze Age IIA-IIB
      -1800
      -1500
      None



In [2042]:

    
desc_words = nltk.wordpunct_tokenize(str(desc_wares_df['desc'].values))
print "num of words: ", len(desc_words)
print desc_words[:10]









    



num of words:  14188
['[', "'", 'The', 'ware', 'is', 'characterized', 'by', 'the', 'decoration', 'of']



In [2043]:

    
desc_vocab = set(desc_words)
print "num of vocab: ", len(desc_vocab)









    



num of vocab:  1892



In [2044]:

    
freq_dist = nltk.FreqDist(desc_words)
freq_dist









    Out[2044]:





FreqDist({',': 679, '.': 527, 'and': 435, 'the': 398, 'of': 296, '-': 290, '\\': 250, '/': 237, "'": 220, '(': 218, ...})



In [2045]:

    
freq_dist.tabulate(15)









    



   ,    .  and  the   of    -    \    /    '    (   is   to    a with   in 
 679  527  435  398  296  290  250  237  220  218  212  208  208  193  150



In [2046]:

    
freq_dist['the']









    Out[2046]:





398



In [2047]:

    
from nltk.corpus import stopwords



In [2048]:

    
stopwords = stopwords.words('english')
stopwords[:10]









    Out[2048]:





[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your']



In [2049]:

    
freq_dist.plot(20)



In [2050]:

    
freq_dist.plot(20, cumulative=True)



In [2051]:

    
desc_no_stopwords = [x.lower() for x in desc_words if x.lower() not in stopwords]
print "num of words: ", len(desc_no_stopwords)
print desc_no_stopwords[:10]









    



num of words:  10598
['[', "'", 'ware', 'characterized', 'decoration', 'alternating', 'wavy', 'straight', 'lines', 'painted']



In [2052]:

    
import string
desc_nopunc_words = [x for x in desc_no_stopwords if x not in list(string.punctuation)]
print "num of words: ", len(desc_nopunc_words)
print desc_nopunc_words[:10]









    



num of words:  7929
['ware', 'characterized', 'decoration', 'alternating', 'wavy', 'straight', 'lines', 'painted', 'red', 'white']



In [2053]:

    
desc_vocab = set(desc_nopunc_words)
print "num of vocab: ", len(desc_vocab)









    



num of vocab:  1626



In [2054]:

    
freq_dist = nltk.FreqDist(desc_nopunc_words)
freq_dist









    Out[2054]:





FreqDist({'red': 157, '6': 138, 'inclusions': 129, 'brown': 120, '4': 120, '5yr': 102, 'r': 102, 'fabric': 99, '2': 91, 'white': 84, ...})



In [2055]:

    
freq_dist.plot(20)



In [2056]:

    
freq_dist.plot(20, cumulative=True)



In [2057]:

    
desc_bigrams = nltk.bigrams(desc_nopunc_words)



In [2058]:

    
freq_dist = nltk.FreqDist(desc_bigrams)
freq_dist









    Out[2058]:





FreqDist({('2', '5yr'): 36, ('r', 'n'): 33, ('5yr', '6'): 30, ('6', '6'): 29, ('red', 'brown'): 27, (';\\', 'r'): 26, ('5', 'yr'): 24, ('5yr', '5'): 24, ('6', '8'): 24, ('4', '6'): 24, ...})



In [2059]:

    
freq_dist.plot(20, cumulative=True)



In [2060]:

    
desc_words_no_nums = [x.lower() for x in desc_nopunc_words if not x.isdigit()]
print "num of words: ", len(desc_words_no_nums)
print desc_words_no_nums[:10]









    



num of words:  7209
['ware', 'characterized', 'decoration', 'alternating', 'wavy', 'straight', 'lines', 'painted', 'red', 'white']



In [2061]:

    
desc_vocab = set(desc_words_no_nums)
print "num of vocab: ", len(desc_vocab)









    



num of vocab:  1586



In [2062]:

    
desc_bigrams = nltk.bigrams(desc_words_no_nums)



In [2063]:

    
freq_dist = nltk.FreqDist(desc_bigrams)
freq_dist









    Out[2063]:





FreqDist({('r', 'n'): 33, ('red', 'brown'): 27, (';\\', 'r'): 26, ('5yr', '5yr'): 24, ('xe2', 'x80'): 24, ('reddish', 'brown'): 22, ('.\\', 'r'): 19, ('iron', 'age'): 18, ('gray', 'core'): 16, ('n', 'r'): 15, ...})



In [2064]:

    
plt.figure(figsize=(10,5))
freq_dist.plot(40, cumulative=True)



In [2065]:

    
descriptions = nltk.Text(desc_words)



In [2066]:

    
descriptions.count('ware')









    Out[2066]:





57



In [2067]:

    
descriptions.concordance('ware')









    



Displaying 25 of 69 matches:
                                     ware is characterized by the decoration o
y core ( 2 . 5YR or 5YR 4 / 1 ). The ware gets its name from the thick , slopp
many ( but not all ) vessels in this ware . This slip is matte and ranges in c
thin , reddish - orange , hard fired ware , which is produced from a highly le
 ." Please see Nabataean Semi - Fine ware . \ r \ n \ r \ nNFW is aestheticall
listine forms .' ' Early Roman Jiyeh Ware ( ERJW ) shows close affinity with t
ur ranging from gray to black . This ware contains very few lime inclusions co
thin , reddish - orange , hard fired ware , which is produced from a highly le
phase 3c is of a thicker and coarser ware and the lines that are common in Dek
ck , black paint .' ' White or light ware , with painted decoration in a singl
listine arrival in Canaan . ' ' This ware has a darker fabric than Philistin M
onze Age in Cyprus . The label Plain Ware is used to group a very large array 
sions .' ' Coarse , very low quality ware , with many inclusions and poorly si
thin , reddish - orange , hard fired ware , which is produced from a highly le
reduced to a dark grey - black . The ware is usually very thin - walled , and 
erves the epithet \ xc2 \ xb4brittle ware \ xe2 \ x80 \ x99 ( as coined by Dys
sions .' ' Vessels are made of crude ware , and decorated with painted geometr
it to be similar to that of the Acre Ware , with a light brown 7 . 5 YR 6 / 4 
or was treated similarly to the Acre Ware vessels , with a light - colored sli
thin , reddish - orange , hard fired ware , which is produced from a highly le
 of the vessel . ' ' Coarse handmade ware , generally of desert origin and aff
periods , the term " Handmade Arabah Ware " is more appropriate ( cf . Martin 
se holemouth jars , made from coarse ware , similar to cooking pots of the lat
 by fire .' ' Characteristic of this ware is exterior paint on Aila ( Aqaba ) 
 is exterior paint on Aila ( Aqaba ) ware vessels . The paint can vary in colo



In [2068]:

    
descriptions.collocations()









    



Iron Age; Stephan Schmid; carefully manufactured; less carefully; gray
core; manufactured examples; highly levigated; occasionally display;
shell thin; sodium feldspars; reddish yellow; irregular fracture;
painted design; small white; NPFW occasionally; levigated clay;
display inclusions; reddish brown; hard fired; iron oxides



In [2069]:

    
descriptions.dispersion_plot(['ware', 'red', 'brown', 'white'])

Use pandas



In [2070]:

    
# character counts
[len(desc) for desc in desc_wares_df['desc']][:10]









    Out[2070]:





[114, 555, 564, 311, 335, 126, 255, 529, 112, 158]



In [2071]:

    
# word counts
[len(desc.split(' ')) for desc in desc_wares_df['desc']][:10]









    Out[2071]:





[19, 99, 102, 54, 58, 19, 38, 77, 18, 21]



In [2072]:

    
plt.figure(figsize=(10,5))
plt.hist([len(desc.split(' ')) for desc in desc_wares_df['desc']], bins=30)
plt.xlabel('description lengths')
plt.ylabel('frequencies')
plt.show()



In [2073]:

    
desc_wares_df.loc[:,'desc'] = desc_wares_df.loc[:,'desc'].str.lower()



In [2083]:

    
desc_wares_df['desc'].head()









    Out[2083]:





0    the ware is characterized by the decoration of...
4    the fabric is coarse and gritty. the color var...
6    moderately hard, coarse fabric with many small...
7    bassit imperial amphorae fabric (hayes 1991, f...
8    a27.1 bassit imperial amphora fabric black san...
Name: desc, dtype: object



In [2138]:

    
desc_nopunc_df = desc_wares_df.replace(to_replace={"desc":{"\W".format(stop_words):" "}}, regex=True)
desc_nopunc_df['desc'].head()









    Out[2138]:





0    the ware is characterized by the decoration of...
4    the fabric is coarse and gritty  the color var...
6    moderately hard  coarse fabric with many small...
7    bassit imperial amphorae fabric  hayes 1991  f...
8    a27 1 bassit imperial amphora fabric black san...
Name: desc, dtype: object



In [2139]:

    
stop_words = "|".join([sw.encode('ascii') for sw in stopwords])
stop_words









    Out[2139]:





'i|me|my|myself|we|our|ours|ourselves|you|your|yours|yourself|yourselves|he|him|his|himself|she|her|hers|herself|it|its|itself|they|them|their|theirs|themselves|what|which|who|whom|this|that|these|those|am|is|are|was|were|be|been|being|have|has|had|having|do|does|did|doing|a|an|the|and|but|if|or|because|as|until|while|of|at|by|for|with|about|against|between|into|through|during|before|after|above|below|to|from|up|down|in|out|on|off|over|under|again|further|then|once|here|there|when|where|why|how|all|any|both|each|few|more|most|other|some|such|no|nor|not|only|own|same|so|than|too|very|s|t|can|will|just|don|should|now'



In [2142]:

    
desc_less_df = desc_nopunc_df.replace(to_replace={"desc":{"(^|\s+)({0})\s+".format(stop_words):" "}}, regex=True)
desc_less_df['desc'].head()









    Out[2142]:





0     ware characterized the decoration alternating...
4     fabric coarse gritty color varies red  2 5 yr...
6    moderately hard  coarse fabric many small larg...
7    bassit imperial amphorae fabric  hayes 1991  f...
8    a27 1 bassit imperial amphora fabric black san...
Name: desc, dtype: object



In [2143]:

    
# run twice cause it's late
desc_less_df = desc_less_df.replace(to_replace={"desc":{"(^|\s+)({0})\s+".format(stop_words):" "}}, regex=True)
desc_less_df['desc'].head()









    Out[2143]:





0     ware characterized decoration alternating wav...
4     fabric coarse gritty color varies red  2 5 yr...
6    moderately hard  coarse fabric many small larg...
7    bassit imperial amphorae fabric  hayes 1991  f...
8    a27 1 bassit imperial amphora fabric black san...
Name: desc, dtype: object



In [2150]:

    
# run thrice cause it's late
desc_less_df = desc_less_df.replace(to_replace={"desc":{"(^|\s+)({0})\s+".format(stop_words):" "}}, regex=True)
desc_less_df['desc'].head()









    Out[2150]:





0     ware characterized decoration alternating wav...
4     fabric coarse gritty color varies red  2 5 yr...
6    moderately hard  coarse fabric many small larg...
7    bassit imperial amphorae fabric  hayes 1991  f...
8    a27 1 bassit imperial amphora fabric black san...
Name: desc, dtype: object



In [2151]:

    
desc_less_df[desc_less_df['desc'].str.contains(' the ')]['desc']









    Out[2151]:





Series([], Name: desc, dtype: object)



In [2157]:

    
desc_less_df['desc']









    Out[2157]:





0       ware characterized decoration alternating wav...
4       fabric coarse gritty color varies red2 5 yr 5...
6      moderately hardcoarse fabric many small large ...
7      bassit imperial amphorae fabrichayes 1991fig 6...
8      a27 1 bassit imperial amphora fabric black san...
9      vessels grey white color granular texturesmall...
10     denseclean light pinkish brown5yr 7 4 7 5yr 8 ...
12     nfw egg shell thinreddish orangehard fired war...
13     heavy red slip vertical burnishing horizontal ...
14     often red slipped black painted decorationsimp...
15     early roman jiyeh wareerjwshows close affinity...
16     npfw egg shell thinreddish orangehard fired wa...
17     white light ware painted decoration single col...
18      ware darker fabric philistin monochrome devel...
19     whitish light brown fabricoccassionally pinkis...
20     plain wares represent major class undecorated ...
22     coarse low quality ware many inclusions poorly...
23     a37 later bassit amphorae fabricc fm32 1o33 3 ...
24                      hardgrittydark grey black fabric
25     m32 bassit mortaria fabrichayes 1967 red brown...
27     m32 1 hard dark brown fabric irregular fractur...
29     n32 oxidisedred dark red fabric hard irregular...
30     n34 oxidised red fabric hard irregular fractur...
31     whitish pinkish fine well levidated clayself s...
32     hellenistic cypriot group 1matrix color ranges...
33      fabric calcareous sandy sometimes mildly poro...
36     reddish brown brown exteriorgray black core re...
37                                          painted pots
38      cleanmoderately harddense light pinkish brown...
39     vessel designed transported seaelongated conic...
                             ...                        
190    moderately hardextremely coarse fabric many me...
191    hardbrick redcoarse fabric10r 5 4moderate poro...
192    hardfinebuff fabric2 5y 7 4 low porosity5 smal...
193    friable greyish white stonepaste inclusionstra...
195    fine reddish yellow7 5yr6 65yr5 6 yellowish br...
196    vessels generally fully fired pale pink brown ...
197    hardcoarsedark brown fabricsurface7 5yr 4 3n b...
198     typical iron age mortarium straight everted w...
199    dominant style cypro geometric periodattested ...
200    fabric varies colourhardness porosity compared...
201    reworked marl taqiye formationreddish yellowra...
202    hardcoarsegrey fabric10yr 5 1grey middlelight ...
203    hardfineorange red fabric2 5yr 6 65yr 6 6low p...
204    npfw egg shell thinreddish orangehard fired wa...
205     matrix reddishclayeyoptically active silty si...
206     ware classified unique physical attribute pla...
207     tell el yahudiya ware characterized distincti...
208    npfw egg shell thinreddish orangehard fired wa...
209    dominant ware middle late cypro geometric peri...
210     fabric ware made nile silt naturally abundant...
211     npfw egg shell thinreddish orangehard fired w...
212    npfw egg shell thinreddish orangehard fired wa...
213    dominant ware early cypro archaic period attes...
214     thick walledcoarse sandy fabric varies pink r...
215     thick walledcoarse sandy fabric varies buff b...
216     fine thin walled ware colors varying buff lig...
217     thin walledbrick red ware used make cooking v...
220    fpf fabric defined conspicuous coarse rounded ...
221     fabric quite hard fairly gritty many smallmed...
223    open carinated bowls made using brownsoil deri...
Name: desc, dtype: object

want to compare each desc to each other; need to keep them separated, unlike below (maybe just in pandas, go lowercase, replace punc, and replace stopwords with nadda); also, as seen below, remove those double puncs that get through using regex (e.g., ".'")



In [2001]:

    
desc_words_no_nums









    Out[2001]:





['ware',
 'characterized',
 'decoration',
 'alternating',
 'wavy',
 'straight',
 'lines',
 'painted',
 'red',
 'white',
 'blue',
 ".'",
 'fabric',
 'coarse',
 'gritty',
 'color',
 'varies',
 'red',
 'yr',
 'yellowish',
 'red',
 'yr',
 'brown',
 'yr',
 'dark',
 'brown',
 'yr',
 'black',
 'limestone',
 'grits',
 'inclusions',
 'sand',
 'white',
 'spots',
 'due',
 'vitrification',
 'vessels',
 'light',
 'colored',
 'exterior',
 'ranges',
 'white',
 'y',
 'light',
 'red',
 'yr',
 '),',
 'pale',
 'brown',
 'yr',
 'light',
 'reddish',
 'brown',
 'yr',
 ').',
 'lighter',
 'colored',
 'exterior',
 'apparently',
 'achieved',
 'dipping',
 'vessels',
 'salt',
 'water',
 'firing',
 'kiln',
 ".'",
 'moderately',
 'hard',
 'coarse',
 'fabric',
 'many',
 'small',
 'large',
 'white',
 'gray',
 'brown',
 'red',
 'inclusions',
 'color',
 'fabric',
 'slip',
 'highly',
 'variable',
 'unslipped',
 'surfaces',
 'fired',
 'purplish',
 'brown',
 '5yr',
 '5yr',
 'pink',
 '10r',
 '),',
 'orange',
 '5yr',
 '),',
 'pale',
 'yellow',
 '10yr',
 ').',
 'often',
 'wide',
 'gray',
 'core',
 '5yr',
 '5yr',
 ').',
 'ware',
 'gets',
 'name',
 'thick',
 'sloppily',
 'applied',
 'slip',
 'partially',
 'coats',
 'many',
 'vessels',
 'ware',
 'slip',
 'matte',
 'ranges',
 'color',
 'red',
 'brown',
 '10r',
 '5yr',
 '5yr',
 'pink',
 '5yr',
 ").'",
 'bassit',
 'imperial',
 'amphorae',
 'fabric',
 'hayes',
 'fig',
 'cf',
 'reynolds',
 'figs',
 'cat',
 '.)',
 'red',
 'oxidized',
 'fabric',
 'hard',
 'fine',
 'fracture',
 'sandy',
 'feel',
 'common',
 'poorly',
 'sorted',
 'angular',
 'black',
 'sand',
 '1mm',
 'common',
 'lime',
 '2mm',
 'moderate',
 'white',
 'quartz',
 'occasional',
 'red',
 'stone',
 '1mm',
 ".'",
 'a27',
 'bassit',
 'imperial',
 'amphora',
 'fabric',
 'black',
 'sand',
 'variant',
 'c',
 'f',
 'o33',
 'hard',
 'brown',
 'fabric',
 'irregular',
 'fracture',
 'sandy',
 'gritty',
 'feel',
 'inclusions',
 'common',
 'sub',
 'rounded',
 'black',
 'sand',
 'mm',
 'moderate',
 'subanglular',
 'lime',
 '2mm',
 'occasional',
 'quartz',
 '5mm',
 'sandy',
 'matrix',
 'moderated',
 'fine',
 'silver',
 'mica',
 ".'",
 'vessels',
 'grey',
 'white',
 'color',
 'granular',
 'texture',
 'small',
 'occasional',
 'medium',
 'angular',
 'lime',
 'inclusions',
 'quartz',
 'sand',
 ".'",
 'dense',
 'clean',
 'light',
 'pinkish',
 'brown',
 '5yr',
 '5yr',
 '),',
 'slightly',
 'granular',
 'fine',
 'lime',
 'inclusions',
 'fully',
 'fired',
 'matte',
 'semi',
 'lustrous',
 'smooth',
 'orange',
 'red',
 'slip',
 'brushed',
 'usually',
 'covering',
 'entire',
 'interior',
 'upper',
 'exterior',
 'dribbling',
 'onto',
 'lower',
 'wall',
 'nfw',
 'egg',
 'shell',
 'thin',
 'reddish',
 'orange',
 'hard',
 'fired',
 'ware',
 'produced',
 'highly',
 'levigated',
 'clay',
 'later',
 'less',
 'carefully',
 'manufactured',
 'examples',
 'nfw',
 'occasionally',
 'display',
 'inclusions',
 'quartz',
 'limestone',
 'could',
 'indicative',
 'mass',
 'production',
 '."',
 'please',
 'see',
 'nabataean',
 'semi',
 'fine',
 'ware',
 'r',
 'n',
 'r',
 'nnfw',
 'aesthetically',
 'pleasing',
 'produced',
 'mimic',
 'eastern',
 'western',
 'sigillatas',
 'metal',
 'later',
 'glass',
 'table',
 'wares',
 'nfw',
 'represented',
 'forms',
 'e',
 'g',
 'cups',
 'jugs',
 'juglets',
 'jars',
 'unguentaria',
 'bowls',
 'plates',
 ".'",
 'heavy',
 'red',
 'slip',
 'vertical',
 'burnishing',
 'horizontal',
 'black',
 'white',
 'lines',
 'decorating',
 'body',
 'vessel',
 ".'",
 'often',
 'red',
 'slipped',
 'black',
 'painted',
 'decoration',
 'simple',
 'philistine',
 'motifs',
 'e',
 'g',
 'spirals',
 'sometimes',
 'undecorated',
 'keeping',
 'classical',
 'philistine',
 'forms',
 ".'",
 'early',
 'roman',
 'jiyeh',
 'ware',
 'erjw',
 'shows',
 'close',
 'affinity',
 'late',
 'hellenistic',
 'material',
 'yet',
 'diversity',
 'two',
 'groups',
 'colour',
 'admixture',
 'made',
 'identification',
 'group',
 'easy',
 'erjw',
 'characterised',
 'surface',
 'reddish',
 'brown',
 'colour',
 'yr',
 '10r',
 '),',
 'thin',
 'fracture',
 'core',
 'colour',
 'ranging',
 'gray',
 'black',
 'ware',
 'contains',
 'lime',
 'inclusions',
 'compared',
 'lhjw',
 'addition',
 'also',
 'contains',
 'gray',
 'black',
 'particles',
 'however',
 'ceramic',
 'body',
 'contains',
 'less',
 'sand',
 'material',
 'previous',
 'period',
 'also',
 'appears',
 'compact',
 'worth',
 'mention',
 'material',
 'quite',
 'resemblance',
 'classic',
 'beirut',
 'clay',
 'also',
 'mentioned',
 'early',
 'roman',
 'production',
 'identical',
 'utility',
 'vessel',
 'groups',
 'ones',
 'earlier',
 'production',
 'distinguished',
 'however',
 'diminished',
 'diversity',
 'specific',
 'types',
 'forms',
 'utility',
 'vessel',
 'groups',
 'noticeable',
 'npfw',
 'egg',
 'shell',
 'thin',
 'reddish',
 'orange',
 'hard',
 'fired',
 'ware',
 'produced',
 'highly',
 'levigated',
 'clay',
 'later',
 'less',
 'carefully',
 'manufactured',
 'examples',
 'npfw',
 'occasionally',
 'display',
 'inclusions',
 'quartz',
 'limestone',
 'vessel',
 'finished',
 'painted',
 'design',
 'designs',
 'categorized',
 'dekorphase',
 'stephan',
 'schmid',
 'dekorphase',
 '3c',
 'thicker',
 'coarser',
 'ware',
 'lines',
 'common',
 'dekorphases',
 '3a',
 '3b',
 'disappeared',
 'instead',
 'palm',
 'leaf',
 'pomegranate',
 'patterns',
 'applied',
 'thick',
 'black',
 'paint',
 ".'",
 'white',
 'light',
 'ware',
 'painted',
 'decoration',
 'single',
 'color',
 'particular',
 'motifs',
 'linked',
 'philistine',
 'arrival',
 'canaan',
 'ware',
 'darker',
 'fabric',
 'philistin',
 'monochrome',
 'develops',
 'vessels',
 'thick',
 'white',
 'slip',
 'red',
 'black',
 'decoration',
 'motifs',
 'continue',
 'seen',
 'philistine',
 'monochrome',
 'variation',
 'spirals',
 'checkerboards',
 'birds',
 'fish',
 'crosses',
 'common',
 ".'",
 'whitish',
 'light',
 'brown',
 'fabric',
 'occassionally',
 'pinkish',
 'well',
 'levigated',
 'brown',
 'decoration',
 'plain',
 'wares',
 'represent',
 'major',
 'class',
 'undecorated',
 'pottery',
 'late',
 'bronze',
 'age',
 'cyprus',
 'label',
 'plain',
 'ware',
 'used',
 'group',
 'large',
 'array',
 'vessel',
 'types',
 'fabrics',
 'mostly',
 'made',
 'coarse',
 'grained',
 'fabrics',
 'full',
 'inclusions',
 ".'",
 'coarse',
 'low',
 'quality',
 'ware',
 'many',
 'inclusions',
 'poorly',
 'sifted',
 'painted',
 'various',
 'scenes',
 'paint',
 'brown',
 'red',
 'earlier',
 'stages',
 'deriving',
 'earlier',
 'imported',
 'bichrome',
 'late',
 'bronze',
 'age',
 'usually',
 'red',
 'lb',
 'iib',
 '13th',
 'century',
 'bce',
 ').',
 'tradition',
 'continues',
 'iron',
 'age',
 'alongside',
 'philistine',
 'pottery',
 'although',
 'smaller',
 'quantities',
 'painted',
 'decorations',
 'include',
 'division',
 'triglyphs',
 'metopes',
 'ibexes',
 'flanking',
 'palm',
 'tree',
 'animals',
 'plants',
 'simpler',
 'lines',
 'lipstick',
 'red',
 'line',
 'paint',
 'along',
 'rim',
 ".'",
 'a37',
 'later',
 'bassit',
 'amphorae',
 'fabric',
 'c',
 'f',
 'm32',
 'o33',
 'oxidised',
 'red',
 'amphora',
 'fabric',
 'hard',
 'irregular',
 'fracture',
 'ad',
 'coarse',
 'feel',
 'inclusions',
 'common',
 'sub',
 'angular',
 'basalt',
 'mm',
 'common',
 'sub',
 'angular',
 'lime',
 '2mm',
 'occasional',
 'red',
 'stone',
 '.\\',
 'r',
 'n',
 'r',
 'nthe',
 'date',
 'distribution',
 'implies',
 'production',
 'commencing',
 'early',
 '3rd',
 'century',
 'presence',
 'sixth',
 'century',
 'forms',
 'include',
 'early',
 'fifth',
 'century',
 'lra1',
 'copies',
 'also',
 'note',
 'group',
 'late',
 '2nd',
 '?-',
 '3rd',
 'c',
 'ce',
 'xe2',
 'x80',
 'x98transport',
 'dolia',
 'xe2',
 'x80',
 'x99',
 'fabric',
 'hard',
 'gritty',
 'dark',
 'grey',
 'black',
 'fabric',
 'm32',
 'bassit',
 'mortaria',
 'fabric',
 'hayes',
 ').',
 'red',
 'brown',
 'fabric',
 'hard',
 'irregular',
 'fracture',
 'harsh',
 'sandy',
 'feel',
 'inclusions',
 'common',
 'sub',
 'angular',
 'quartz',
 '4mm',
 'common',
 'black',
 'sand',
 '5mm',
 'rounded',
 'lime',
 '1mm',
 ".'",
 'm32',
 'hard',
 'dark',
 'brown',
 'fabric',
 'irregular',
 'fracture',
 'harsh',
 'sandy',
 'feel',
 'inclusions',
 'abundant',
 'basalt',
 '6mm',
 'common',
 'sub',
 'rounded',
 'lime',
 'c',
 '5mm',
 'sandy',
 'matrix',
 ".'",
 'n32',
 'oxidised',
 'red',
 'dark',
 'red',
 'fabric',
 'hard',
 'irregular',
 'fracture',
 'sandy',
 'feel',
 'inclusions',
 'common',
 'quartz',
 '2mm',
 'black',
 'sand',
 '2mm',
 'occasional',
 'lime',
 'red',
 'stone',
 'n34',
 'oxidised',
 'red',
 'fabric',
 'hard',
 'irregular',
 'fracture',
 'sandy',
 'feel',
 'inclusions',
 'common',
 'subrounded',
 'lime',
 '8mm',
 'moderate',
 'black',
 'sand',
 '3mm',
 '.\\',
 'r',
 'n',
 'r',
 'nthis',
 'fabric',
 'would',
 'appear',
 'used',
 'later',
 'dolia',
 ".'",
 'whitish',
 'pinkish',
 'fine',
 'well',
 'levidated',
 'clay',
 'self',
 'slip',
 'dark',
 'brown',
 'reddish',
 'monochrome',
 'decoration',
 '.\\',
 'r',
 'n',
 'hellenistic',
 'cypriot',
 'group',
 'matrix',
 'color',
 'ranges',
 'orange',
 'red',
 'according',
 'firing',
 'temperature',
 'level',
 'grey',
 'black',
 'red',
 'grits',
 'white',
 'yellowish',
 'lime',
 'grits',
 'may',
 'occur',
 'section',
 'matrix',
 'slip',
 'well',
 'sticked',
 'clay',
 'orange',
 'reddish',
 'coloured',
 'often',
 'lime',
 'spots',
 'visible',
 'surface',
 ".'",
 'fabric',
 'calcareous',
 'sandy',
 'sometimes',
 'mildly',
 'porous',
 'main',
 'tempers',
 'characteristic',
 'well',
 'sorted',
 'costal',
 'quartz',
 'sand',
 '(~',
 '%),',
 'poorly',
 'sorted',
 'lime',
 'stone',
 'carmel',
 'kurkar',
 'fragments',
 'sea',
 'shells',
 'r',
 'nthe',
 'clay',
 'crumbly',
 'nature',
 'firing',
 'process',
 'often',
 'leaves',
 'dark',
 'core',
 'could',
 'suggest',
 'medium',
 'level',
 'firing',
 'jars',
 'display',
 'regular',
 'surface',
 'treatment',
 'decoration',
 'r',
 'n',
 'r',
 'nflasks',
 'fabric',
 'calcareous',
 'sandy',
 'sometimes',
 'mildly',
 'porous',
 'main',
 'tempers',
 'well',
 'sorted',
 'coastal',
 'quartz',
 'sand',
 '(~',
 '%),',
 'poorly',
 'sorted',
 'limestone',
 'carmel',
 'kurkar',
 'fragments',
 'sea',
 'shells',
 'r',
 'n',
 'r',
 'nthe',
 'clay',
 'crumbly',
 'nature',
 'firing',
 'process',
 'often',
 'leaves',
 'thin',
 'strip',
 'oxidized',
 'reddish',
 'fabric',
 'leaving',
 'dark',
 'suggests',
 'medium',
 'level',
 'firing',
 'air',
 'reached',
 'inner',
 'part',
 'flask',
 'firing',
 'process',
 '.\\',
 'r',
 'n',
 'r',
 'nflasks',
 'polished',
 'slipped',
 'concentric',
 'decoration',
 'red',
 'red',
 'black',
 'typological',
 'chronological',
 'discussion',
 'flasks',
 'see',
 'gilboa',
 'et',
 'al',
 'r',
 'n',
 ...]

then NN this jank

https://docs.google.com/presentation/d/102TFe5dAmUXja_Ft31z__NUUuY7PZG9AkV7Qn_TwPXg/edit#slide=id.g58471b318_01

	id	user_id	old_region_id	privacy_status	start_year	end_year
count	224.000000	222.000000	85.000000	224.000000	155.000000	155.000000
mean	148.602679	27.234234	12.729412	1.263393	-1253.290323	193.580645
std	69.727324	36.824263	7.698776	0.654271	10503.905823	848.788066
min	21.000000	1.000000	1.000000	1.000000	-92500.000000	-2500.000000
25%	88.750000	5.000000	8.000000	1.000000	-800.000000	-300.500000
50%	148.500000	13.000000	13.000000	1.000000	0.000000	99.000000
75%	205.250000	28.000000	19.000000	1.000000	435.000000	714.500000
max	271.000000	165.000000	24.000000	4.000000	2450.000000	2250.000000