In [88]:
# Build a 100x10 integer array: values 0..999 reshaped into 100 rows of 10 columns.
# NOTE: this notebook uses Python 2 print statements throughout.
import numpy as np
data = np.arange(1000).reshape(100,10)
print data.shape
In [89]:
# Wrap the array in a pandas DataFrame with columns named x0..x9.
import pandas as pd
pand_tmp = pd.DataFrame(data,
columns=['x{0}'.format(i) for i in range(data.shape[1])])
pand_tmp.head()
Out[89]:
What is the sum of each row?
In [90]:
# axis=1 sums across the columns, producing one total per row (a Series of length 100).
pand_tmp.sum(axis=1)
Out[90]:
And the sum of each column?
In [91]:
# axis=0 sums down each column, producing a Series indexed by column name.
pand_tmp.sum(axis=0)
Out[91]:
In [92]:
# Write the frame to CSV for the Spark examples below: header row kept,
# pandas row index dropped so the file contains only the x0..x9 data.
pand_tmp.to_csv('numbers.csv', index=False)
In [93]:
# Read the CSV back as an RDD of raw text lines, split into 18 partitions.
# `sc` is the SparkContext -- presumably injected by the PySpark shell/kernel;
# it is not defined anywhere in this notebook.
lines = sc.textFile('numbers.csv', 18)
for l in lines.take(3):
print l
In [94]:
# take(1) returns a plain Python list (of one string), not an RDD.
type(lines.take(1))
Out[94]:
How do we skip the header? How about using find()? (Note: find() returns a match index, not a Boolean — 0 means the substring starts the line, -1 means no match.)
In [95]:
# Drop the header line: find('x') returns 0 when 'x' is the first character,
# so keeping lines where find('x') != 0 removes the "x0,x1,..." header row.
# (Data lines contain only digits and commas, so they never start with 'x'.)
lines = lines.filter(lambda x: x.find('x') != 0)
for l in lines.take(2):
print l
In [96]:
# Split each CSV line into a list of string fields.
data = lines.map(lambda x: x.split(','))
data.take(3)
Out[96]:
Cast to integer and sum!
In [97]:
def row_sum(x):
    """Sum the fields of one parsed CSV row.

    Parameters
    ----------
    x : iterable of str
        One CSV record with its fields still as strings.

    Returns
    -------
    int
        The integer sum of the fields (0 for an empty row).
    """
    # Original used map(lambda x: int(x), x), which shadowed the parameter
    # `x` inside the lambda; a generator expression is clearer and avoids
    # building an intermediate list. int() raises ValueError on bad fields.
    return sum(int(field) for field in x)
# Apply row_sum to every parsed row; collect() pulls all 100 sums back to the driver.
data_row_sum = data.map(row_sum)
print data_row_sum.collect()
print data_row_sum.count()
This one's a bit trickier, and portends ill for large, complex data sets (like example 4)... Let's enumerate the list comprising each RDD "line" such that each value is indexed by the corresponding column number.
In [98]:
def col_key(x):
    """Yield (column_index, int_value) pairs for one parsed CSV row.

    Keying each value by its column position lets flatMap produce a
    pairwise RDD that can later be grouped by column.
    """
    idx = 0
    for field in x:
        yield idx, int(field)
        idx += 1
# flatMap flattens the per-row (column, value) pairs into one long pairwise RDD:
# 100 rows x 10 columns = 1000 (key, value) elements.
tmp = data.flatMap(col_key)
tmp.take(12)
Out[98]:
In [99]:
tmp.take(3)
Out[99]:
In [100]:
# Group all values by column index; each element becomes (key, iterable-of-values).
# NOTE(review): rebinding `tmp = tmp.groupByKey()` is not idempotent -- re-running
# this cell on an already-grouped RDD would fail. groupByKey also shuffles every
# value; reduceByKey would aggregate without materializing each group.
tmp = tmp.groupByKey()
for i in tmp.take(2):
print i, type(i)
In [101]:
# Sum the grouped values to get one total per column (x[1] is the value iterable).
data_col_sum = tmp.map(lambda x: sum(x[1]))
for i in data_col_sum.take(2):
print i
In [102]:
print data_col_sum.collect()
print data_col_sum.count()
In [103]:
# NOTE(review): wildcard import pollutes the namespace -- prefer importing the
# specific names needed from pyspark.sql.types.
from pyspark.sql.types import *
In [104]:
# Convert the pandas frame directly into a Spark SQL DataFrame.
# `sqlCtx` (SQLContext) is presumably injected by the PySpark shell/kernel;
# it is not defined anywhere in this notebook.
pyspark_df = sqlCtx.createDataFrame(pand_tmp)
In [105]:
pyspark_df.take(2)
Out[105]:
In [106]:
# Column sums via the DataFrame API: one aggregate query per column.
for i in pyspark_df.columns:
print pyspark_df.groupBy().sum(i).collect()