In [2]:
import itertools as it
import sys
import xlrd
from os import path as p, chdir
from io import StringIO
from array import array
if 'examples' in p.abspath('.'):
    chdir('..')
from meza import io, process as pr, convert as cv, fntools as ft, stats
In [3]:
# Create a csv file
text = (
    'col1,col2,col3\n'
    'hello,5/4/82,1\n'
    'one,1/1/15,2\n'
    'happy,7/4/92,3\n')
f = StringIO(text)
f.seek(0)
# Load the file
records = io.read_csv(f)
# Records are an iterator over the rows
row = next(records)
row
Out[3]:
In [4]:
# Replace the first row so as not to lose any data
records = pr.prepend(records, row)
# Guess column types. Note: `detect_types` returns a new `records`
# generator since it consumes rows during type detection
records, result = pr.detect_types(records)
result['types']
Out[4]:
In [5]:
# Now type cast the records. Note: most `meza.process` functions return
# generators, so let's wrap the result in a list to view the data
casted = list(pr.type_cast(records, **result))
casted[0]
Out[5]:
In [6]:
# Cut out the first column of data
cut_recs = pr.cut(casted, ['col1'], exclude=True)
cut_recs
Out[6]:
In [7]:
# Merge the cut rows and get the max value of the remaining columns.
# Note: since `merge` (by definition) will always contain just one row,
# it is returned as is (not wrapped in a generator)
merged = pr.merge(cut_recs, pred=bool, op=max)
merged
Out[7]:
In [8]:
# Now write merged data back to a new csv file.
f = StringIO()
io.write(f, cv.records2csv([merged]))
f.getvalue()
Out[8]:
meza can read both filepaths and file-like objects. Additionally, all readers return equivalent records iterators, i.e., a generator of dictionaries with keys corresponding to the column names. Please see Readers for a complete list of available readers and recognized file types.
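Since readers also accept filepaths, the same call works against a file on disk (a minimal sketch; the path is hypothetical):
# Hypothetical path; any reader accepts a filepath in place of a file-like object
records = io.read_csv('/path/to/file.csv')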
In [9]:
# Read a file-like object and de-duplicate the header
f = StringIO('col,col\nhello,world\n')
records = io.read_csv(f, dedupe=True)
# View the first row
next(records)
Out[9]:
In the following example, pandas equivalent methods are preceded by -->.
In [10]:
# Create some data in the same structure as what the various `read...`
# functions output
header = ['A', 'B', 'C', 'D']
data = [
    [0.5607, 0.9338, 0.4769, 0.7804],
    [0.8227, 0.2844, 0.8166, 0.7323],
    [0.4627, 0.8633, 0.3283, 0.1909],
    [0.3932, 0.5454, 0.9604, 0.6376],
    [0.3685, 0.9166, 0.9457, 0.8066],
    [0.7584, 0.6981, 0.5625, 0.3578],
    [0.8959, 0.6932, 0.2565, 0.3378]]
df = [dict(zip(header, d)) for d in data]
df[0]
Out[10]:
In [11]:
# Sort records by the value of column `B` --> df.sort_values(by='B')
next(pr.sort(df, 'B'))
Out[11]:
In [12]:
# Select column `A` --> df['A']
next(pr.cut(df, ['A']))
Out[12]:
In [13]:
# Select the first 3 rows of data --> df[0:3]
list(it.islice(df, 3))
Out[13]:
In [14]:
# Select all data whose value for column `A` is less than 0.5 --> df[df.A < 0.5]
next(pr.tfilter(df, 'A', lambda x: x < 0.5))
Out[14]:
Note: since aggregate and merge (by definition) return just one row, they return it as is (not wrapped in a generator).
In [15]:
# Calculate the mean of column `A` across all data --> df.mean()['A']
pr.aggregate(df, 'A', stats.mean)['A']
Out[15]:
In [16]:
# Calculate the sum of each column across all data --> df.sum()
pr.merge(df, pred=bool, op=sum)
Out[16]:
In the following example, csvkit equivalent commands are preceded by -->.
In [17]:
# First create a few simple csv files
f1 = StringIO('col_1,col_2,col_3\n1,dill,male\n2,bob,male\n3,jane,female\n')
f2 = StringIO('col_1,col_2,col_3\n4,tom,male\n5,dick,male\n6,jill,female\n')
f1.seek(0)
f2.seek(0)
Out[17]:
Now we can read the files, manipulate the data, convert the manipulated data to json, and write the json back to a new file. Also, note that since all readers return equivalent records iterators, you can use them interchangeably (in place of read_csv) to open any supported file. E.g., read_xls, read_sqlite, etc.
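For example, the same kind of records iterator could come from an Excel file instead (a quick sketch; the filename is hypothetical):
# Hypothetical file; read_xls returns the same kind of records iterator as read_csv
records = io.read_xls('/path/to/data.xls')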
In [18]:
# First, let's combine the files into one iterator --> csvstack file1.csv file2.csv
records = io.join(f1, f2, ext='csv')
next(records)
Out[18]:
In [19]:
# Skip ahead to the last row of the combined data
next(it.islice(records, 4, None))
Out[19]:
In [20]:
# Now let's create a persistent records list so we can do some simple manipulations
f1.seek(0)
records = list(io.read_csv(f1))
In [21]:
# Sort records by the value of column `col_2` --> csvsort -c col_2 file1.csv
next(pr.sort(records, 'col_2'))
Out[21]:
In [22]:
# Select column `col_2` --> csvcut -c col_2 file1.csv
next(pr.cut(records, ['col_2']))
Out[22]:
In [23]:
# Select all data whose value for column `col_2` contains `jan`
# --> csvgrep -c col_2 -m jan file1.csv
next(pr.grep(records, [{'pattern': 'jan'}], ['col_2']))
Out[23]:
In [24]:
# Convert a csv file to json --> csvjson -i 4 file1.csv
f_json = StringIO()
io.write(f_json, cv.records2json(records))
f_json.getvalue()
Out[24]:
In the following example, mapbox equivalent commands are preceded by -->.
In [25]:
# First create a geojson file
f = StringIO(
    '{"type": "FeatureCollection","features": ['
    '{"type": "Feature", "id": 11, "geometry": '
    '{"type": "Point", "coordinates": [10, 20]}},'
    '{"type": "Feature", "id": 12, "geometry": '
    '{"type": "Point", "coordinates": [5, 15]}}]}')
f.seek(0)
Out[25]:
Now we can open the file, split the data by id, and finally convert the split data to a new geojson file-like object.
In [26]:
# Load the geojson file and peek at the results
records, peek = pr.peek(io.read_geojson(f))
peek[0]
Out[26]:
In [27]:
# Split the records by feature `id` and select the first feature
# --> geojsplit -k id file.geojson
splits = pr.split(records, 'id')
feature_records, name = next(splits)
name
Out[27]:
In [28]:
# Convert the feature records into a GeoJSON file-like object
geojson = cv.records2geojson(feature_records)
geojson.readline()
Out[28]:
Note: you can also write back to a file as shown previously: io.write('file.geojson', geojson).
meza can persist records to disk via its converter functions, e.g., cv.records2csv, cv.records2json, and cv.records2geojson. Each function returns a file-like object that you can write to disk via meza.io.write('/path/to/file', result).
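For example (a minimal sketch; 'out.csv' is a hypothetical path and records is any records iterable):
# Convert the records to a csv file-like object, then persist it to disk
io.write('out.csv', cv.records2csv(records))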
In [29]:
# First let's create a simple tsv file-like object
f = StringIO('col1\tcol2\nhello\tworld\n')
f.seek(0)
# Next create a records list so we can reuse it
records = list(io.read_tsv(f))
records[0]
Out[29]:
Now we're ready to write the records data to a file.
In [30]:
# Create a csv file-like object
f_out = cv.records2csv(records)
f_out.readline().strip()
Out[30]:
In [31]:
# Create a json file-like object
f_out = cv.records2json(records)
f_out.readline()
Out[31]:
meza plays nicely with native and NumPy arrays out of the box.
In [32]:
# First create some records and types. Also, convert the records to a list
# so we can reuse them.
records = [{'a': 'one', 'b': 2}, {'a': 'five', 'b': 10, 'c': 20.1}]
records, result = pr.detect_types(records)
records, types = list(records), result['types']
types
Out[32]:
In [33]:
# Convert records to a list of arrays
narray = cv.records2array(records, types, native=True)
narray
Out[33]:
In [34]:
# Convert list of arrays back to records
next(cv.array2records(narray, native=True))
Out[34]:
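The NumPy counterpart should look analogous (a sketch, assuming NumPy is installed; the expectation is that omitting native=True yields NumPy-based arrays):
# Assumption: without `native=True`, records2array/array2records work with NumPy arrays
recarray = cv.records2array(records, types)
next(cv.array2records(recarray))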
In [35]:
# Convert a 2-D array to records
arr = [(1, 4), (2, 5), (3, 6)]
data = [array('i', a) for a in arr]
data
Out[35]:
In [36]:
# Convert the arrays to records and view the first one
next(cv.array2records(data, native=True))
Out[36]: