In [223]:
from pyspark import SparkConf, SparkContext
from collections import OrderedDict
from functools import partial
import re
from numpy import array
from math import sqrt
import numpy as np
In [224]:
# Number of partitions for the input RDD; tune to cluster size / file size.
partitions = 64
# Load the raw Lustre debug log from HDFS as an RDD of text lines.
# NOTE(review): relies on the notebook kernel's pre-created SparkContext `sc`;
# the SparkConf/SparkContext imports above are not used here.
parlog = sc.textFile("/user/milroy/lustre_debug.out", partitions)
In [225]:
# Peek at the first 10 raw log lines to check the format.
parlog.take(10)
Out[225]:
In [326]:
words = parlog.filter(lambda line: line.count('-') > 3).filter(
lambda line: 'updating' in line).map(lambda line: re.split('\W+', line.lower().strip()))
In [ ]:
words.take(3)
In [329]:
exports = words.flatMap(lambda line: [[int(line[7]), str(x), str(''.join(line[15:19]))]
for x in line if x.startswith('ffff')])
In [330]:
exports.take(1)
Out[330]:
In [331]:
to_int = exports.map(lambda row: [row[0], int(row[1], 16), int(row[2], 16)])
In [332]:
to_int.take(10)
Out[332]:
In [333]:
to_vector = to_int.map(lambda row: np.asarray([float(x) for x in row]))
In [342]:
to_vector.take(2)
Out[342]:
In [344]:
from pyspark.mllib.stat import Statistics
In [347]:
# Pairwise correlation matrix over the three features of each vector.
# NOTE(review): the variable is named pearsonCorr but method="spearman" is
# requested — the result is a Spearman rank-correlation matrix, not Pearson.
# Name kept unchanged in case later (unseen) cells reference it.
pearsonCorr = Statistics.corr(to_vector, method="spearman")
In [348]:
# Display the 3x3 correlation matrix.
pearsonCorr
Out[348]: