In [2]:
import itertools as it
import sys
import xlrd

from os import path as p, chdir
from io import StringIO
from array import array

if 'examples' in p.abspath('.'):
    chdir('..')
    
from meza import io, process as pr, convert as cv, fntools as ft, stats

Load data, type cast, and write to a CSV file


In [3]:
# Create a csv file
text = (
    'col1,col2,col3\n'
    'hello,5/4/82,1\n'
    'one,1/1/15,2\n'
    'happy,7/4/92,3\n')

f = StringIO(text)
f.seek(0)

# Load the file
records = io.read_csv(f)

# Records are an iterator over the rows
row = next(records)
row


Out[3]:
{'col1': 'hello', 'col2': '5/4/82', 'col3': '1'}

In [4]:
# Replace the first row so as not to lose any data
records = pr.prepend(records, row)

# Guess column types. Note: `detect_types` returns a new `records`
# generator since it consumes rows during type detection
records, result = pr.detect_types(records)
result['types']


Out[4]:
[{'id': 'col1', 'type': 'text'},
 {'id': 'col2', 'type': 'date'},
 {'id': 'col3', 'type': 'int'}]

In [5]:
# Now type cast the records. Note: most `meza.process` functions return
# generators, so let's wrap the result in a list to view the data
casted = list(pr.type_cast(records, **result))
casted[0]


Out[5]:
{'col1': 'hello', 'col2': datetime.date(1982, 5, 4), 'col3': 1}

Now run some operations on the type-cast data


In [6]:
# Cut out the first column of data
cut_recs = pr.cut(casted, ['col1'], exclude=True)
cut_recs


Out[6]:
<generator object cut.<locals>.<genexpr> at 0x103d0adb0>

In [7]:
# Merge the cut rows and get the max value of the remaining columns.
# Note: since the result of `merge` (by definition) is just one row,
# it is returned as is (not wrapped in a generator)
merged = pr.merge(cut_recs, pred=bool, op=max)
merged


Out[7]:
{'col2': datetime.date(2015, 1, 1), 'col3': 3}

In [8]:
# Now write the merged data back to a new csv file.
f = StringIO()
io.write(f, cv.records2csv([merged]))
f.getvalue()


Out[8]:
'col2,col3\r\n2015-01-01,3\r\n'

Reading data

meza can read both filepaths and file-like objects. Additionally, all readers return equivalent records iterators, i.e., generators of dictionaries whose keys correspond to the column names. Please see Readers for a complete list of available readers and recognized file types.

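For example, passing a filepath instead of a file-like object works the same way. A minimal sketch ('/path/to/data.csv' is a hypothetical path, not a file created in this notebook):

# readers accept a filepath string in place of a file-like object
# (hypothetical path; assumes the file exists and is a valid csv)
records = io.read_csv('/path/to/data.csv')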

In [9]:
# Read a file-like object and de-duplicate the header
f = StringIO('col,col\nhello,world\n')
records = io.read_csv(f, dedupe=True)

# View the first row
next(records)


Out[9]:
{'col': 'hello', 'col_2': 'world'}

Processing data

Numerical analysis (à la pandas)

In the following example, pandas equivalent methods are preceded by -->.


In [10]:
# Create some data in the same structure as what the various `read...`
# functions output
header = ['A', 'B', 'C', 'D']
data = [
    [0.5607, 0.9338, 0.4769, 0.7804],
    [0.8227, 0.2844, 0.8166, 0.7323],
    [0.4627, 0.8633, 0.3283, 0.1909],
    [0.3932, 0.5454, 0.9604, 0.6376],
    [0.3685, 0.9166, 0.9457, 0.8066],
    [0.7584, 0.6981, 0.5625, 0.3578],
    [0.8959, 0.6932, 0.2565, 0.3378]]

df = [dict(zip(header, d)) for d in data]
df[0]


Out[10]:
{'A': 0.5607, 'B': 0.9338, 'C': 0.4769, 'D': 0.7804}

In [11]:
# Sort records by the value of column `B` --> df.sort_values(by='B')
next(pr.sort(df, 'B'))


Out[11]:
{'A': 0.8227, 'B': 0.2844, 'C': 0.8166, 'D': 0.7323}

In [12]:
# Select column `A` --> df['A']
next(pr.cut(df, ['A']))


Out[12]:
{'A': 0.5607}

In [13]:
# Select the first 3 rows of data --> df[0:3]
list(it.islice(df, 3))


Out[13]:
[{'A': 0.5607, 'B': 0.9338, 'C': 0.4769, 'D': 0.7804},
 {'A': 0.8227, 'B': 0.2844, 'C': 0.8166, 'D': 0.7323},
 {'A': 0.4627, 'B': 0.8633, 'C': 0.3283, 'D': 0.1909}]

In [14]:
# Select all data whose value for column `A` is less than 0.5 --> df[df.A < 0.5]
next(pr.tfilter(df, 'A', lambda x: x < 0.5))


Out[14]:
{'A': 0.4627, 'B': 0.8633, 'C': 0.3283, 'D': 0.1909}

Note: since aggregate and merge (by definition) produce just one row, the result is returned as is (not wrapped in a generator).


In [15]:
# Calculate the mean of column `A` across all data --> df.mean()['A']
pr.aggregate(df, 'A', stats.mean)['A']


Out[15]:
0.6088714285714286

In [16]:
# Calculate the sum of each column across all data --> df.sum()
pr.merge(df, pred=bool, op=sum)


Out[16]:
{'A': 4.2621, 'B': 4.9348, 'C': 4.3469, 'D': 3.8434}

Text processing (à la csvkit)

In the following example, csvkit equivalent commands are preceded by -->.


In [17]:
# First create a few simple csv files
f1 = StringIO('col_1,col_2,col_3\n1,dill,male\n2,bob,male\n3,jane,female\n')
f2 = StringIO('col_1,col_2,col_3\n4,tom,male\n5,dick,male\n6,jill,female\n')
f1.seek(0)
f2.seek(0)


Out[17]:
0

Now we can read the files, manipulate the data, convert the manipulated data to json, and write the json back to a new file. Also note that since all readers return equivalent records iterators, you can use them interchangeably (in place of read_csv) to open any supported file, e.g., read_xls, read_sqlite, etc.

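For instance, swapping in a different reader is a one-line change. A sketch, assuming suitably formatted files exist at the (hypothetical) paths below:

# any reader yields the same records iterator as `read_csv`
# (hypothetical paths; these files are not created in this notebook)
records = io.read_xls('/path/to/data.xls')
records = io.read_sqlite('/path/to/data.sqlite')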

In [18]:
# First, let's combine the files into one iterator --> csvstack file1.csv file2.csv
records = io.join(f1, f2, ext='csv')
next(records)


Out[18]:
{'col_1': '1', 'col_2': 'dill', 'col_3': 'male'}

In [19]:
# Skip ahead to the last row of the combined files
next(it.islice(records, 4, None))


Out[19]:
{'col_1': '6', 'col_2': 'jill', 'col_3': 'female'}

In [20]:
# Now let's create a persistent records list so we can do some simple manipulations
f1.seek(0)
records = list(io.read_csv(f1))

In [21]:
# Sort records by the value of column `col_2` --> csvsort -c col_2 file1.csv
next(pr.sort(records, 'col_2'))


Out[21]:
{'col_1': '2', 'col_2': 'bob', 'col_3': 'male'}

In [22]:
# Select column `col_2` --> csvcut -c col_2 file1.csv
next(pr.cut(records, ['col_2']))


Out[22]:
{'col_2': 'dill'}

In [23]:
# Select all data whose value for column `col_2` contains `jan`
# --> csvgrep -c col_2 -m jan file1.csv
next(pr.grep(records, [{'pattern': 'jan'}], ['col_2']))


Out[23]:
{'col_1': '3', 'col_2': 'jane', 'col_3': 'female'}

In [24]:
# Convert a csv file to json --> csvjson -i 4 file1.csv
f_json = StringIO()
io.write(f_json, cv.records2json(records))
f_json.getvalue()


Out[24]:
'[{"col_2": "dill", "col_3": "male", "col_1": "1"}, {"col_2": "bob", "col_3": "male", "col_1": "2"}, {"col_2": "jane", "col_3": "female", "col_1": "3"}]'

Geo processing (à la mapbox)

In the following example, mapbox equivalent commands are preceded by -->.


In [25]:
# First create a geojson file
f = StringIO(
    '{"type": "FeatureCollection","features": ['
    '{"type": "Feature", "id": 11, "geometry": '
    '{"type": "Point", "coordinates": [10, 20]}},'
    '{"type": "Feature", "id": 12, "geometry": '
    '{"type": "Point", "coordinates": [5, 15]}}]}')

f.seek(0)


Out[25]:
0

Now we can open the file, split the data by id, and finally convert the split data to a new geojson file-like object.


In [26]:
# Load the geojson file and peek at the results
records, peek = pr.peek(io.read_geojson(f))
peek[0]


Out[26]:
{'id': 11, 'lat': 20, 'lon': 10, 'type': 'Point'}

In [27]:
# Split the records by feature `id` and select the first feature --> geojsplit -k id file.geojson
splits = pr.split(records, 'id')
feature_records, name = next(splits)
name


Out[27]:
11

In [28]:
# Convert the feature records into a GeoJSON file-like object
geojson = cv.records2geojson(feature_records)
geojson.readline()


Out[28]:
'{"type": "FeatureCollection", "bbox": [10, 20, 10, 20], "features": [{"type": "Feature", "id": 11, "geometry": {"type": "Point", "coordinates": [10, 20]}, "properties": {"id": 11}}], "crs": {"type": "name", "properties": {"name": "urn:ogc:def:crs:OGC:1.3:CRS84"}}}'

Note: you can also write back to a file as shown previously, e.g., io.write('file.geojson', geojson).

Writing Data

meza can persist records to disk via the following functions:

  • meza.convert.records2csv
  • meza.convert.records2json
  • meza.convert.records2geojson

Each function returns a file-like object that you can write to disk via meza.io.write('/path/to/file', result).
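Putting the two steps together, persisting records to disk looks like the following sketch (here `records` stands for any records iterable, and the output path is hypothetical):

# convert the records, then write the resulting file-like object to disk
# (hypothetical output path)
csv_file = cv.records2csv(records)
io.write('/path/to/output.csv', csv_file)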


In [29]:
# First let's create a simple tsv file-like object
f = StringIO('col1\tcol2\nhello\tworld\n')
f.seek(0)

# Next create a records list so we can reuse it
records = list(io.read_tsv(f))
records[0]


Out[29]:
{'col1': 'hello', 'col2': 'world'}

Now we're ready to write the records data to a file


In [30]:
# Create a csv file-like object
f_out = cv.records2csv(records)
f_out.readline().strip()


Out[30]:
'col1,col2'

In [31]:
# Create a json file-like object
f_out = cv.records2json(records)
f_out.readline()


Out[31]:
'[{"col1": "hello", "col2": "world"}]'

Interoperability

meza plays nicely with native Python arrays and NumPy arrays out of the box


In [32]:
# First create some records and types. Also, convert the records to a list
# so we can reuse them.
records = [{'a': 'one', 'b': 2}, {'a': 'five', 'b': 10, 'c': 20.1}]
records, result = pr.detect_types(records)
records, types = list(records), result['types']
types


Out[32]:
[{'id': 'b', 'type': 'int'},
 {'id': 'c', 'type': 'float'},
 {'id': 'a', 'type': 'text'}]

In [33]:
# Convert records to a list of arrays
narray = cv.records2array(records, types, native=True)
narray


Out[33]:
[[array('u', 'b'), array('u', 'c'), array('u', 'a')],
 array('i', [2, 10]),
 array('f', [0.0, 20.100000381469727]),
 [array('u', 'one'), array('u', 'five')]]

In [34]:
# Convert list of arrays back to records
next(cv.array2records(narray, native=True))


Out[34]:
{'a': 'one', 'b': 2, 'c': 0.0}

In [35]:
# Convert a 2-D array to records
arr = [(1, 4), (2, 5), (3, 6)]
data = [array('i', a) for a in arr]
data


Out[35]:
[array('i', [1, 4]), array('i', [2, 5]), array('i', [3, 6])]

In [36]:
next(cv.array2records(data, native=True))


Out[36]:
{'column_1': 1, 'column_2': 2, 'column_3': 3}
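The examples above pass native=True to work with Python's built-in array module. With the optional NumPy dependency installed, omitting native=True performs the same round trip using NumPy arrays. A sketch, assuming numpy is available in the environment:

# same round trip as In [33]-[34], but backed by NumPy instead of `array`
# (assumes the optional numpy dependency is installed)
numpy_array = cv.records2array(records, types)
next(cv.array2records(numpy_array))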