SVD Python

In this NoteBook the reader finds code to read a GeoTiff file, single- or multi-band, from HDFS. It reads the GeoTiff as a ByteArray and then stores the GeoTiff in memory using MemFile from RasterIO python package. Then scipy is used to determine the SVD of a matrix multiplication between two phenology products.

With this example the user can load GeoTiffs from HDFS and then explore all the features of Python packages such as rasterio.

Dependencies



In [1]:

    
#Add all dependencies to PYTHON_PATH
import sys
sys.path.append("/usr/lib/spark/python")
sys.path.append("/usr/lib/spark/python/lib/py4j-0.10.4-src.zip")
sys.path.append("/usr/lib/python3/dist-packages")
sys.path.append("/data/local/jupyterhub/modules/python")

#Define environment variables
import os
os.environ["HADOOP_CONF_DIR"] = "/etc/hadoop/conf"
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "ipython"

import subprocess

#Load PySpark to connect to a Spark cluster
from pyspark import SparkConf, SparkContext
from hdfs import InsecureClient
from tempfile import TemporaryFile

#from osgeo import gdal
#To read GeoTiffs as a ByteArray
from io import BytesIO
from rasterio.io import MemoryFile

import numpy as np
import pandas
import datetime
import matplotlib.pyplot as plt
import rasterio
from rasterio import plot
from os import listdir
from os.path import isfile, join
from numpy import exp, log
from numpy.random import standard_normal
from scipy.linalg import norm, qr, svd
from lowrankproduct import lowrankproduct
from sklearn.utils.extmath import randomized_svd

Connect to Spark



In [11]:

    
appName = "plot_GeoTiff"
masterURL="spark://pheno0.phenovari-utwente.surf-hosted.nl:7077"

#A context needs to be created if it does not already exist
try:
    sc.stop()
except NameError:
    print("A new Spark Context will be created.")

sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))
conf = sc.getConf()









    



A new Spark Context will be created.






    



---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
<ipython-input-11-410d8019d0a8> in <module>()
      8     print("A new Spark Context will be created.")
      9 
---> 10 sc = SparkContext(conf = SparkConf().setAppName(appName).setMaster(masterURL))
     11 conf = sc.getConf()

/usr/lib/spark/python/pyspark/context.py in __init__(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, gateway, jsc, profiler_cls)
    113         """
    114         self._callsite = first_spark_call() or CallSite(None, None, None)
--> 115         SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
    116         try:
    117             self._do_init(master, appName, sparkHome, pyFiles, environment, batchSize, serializer,

/usr/lib/spark/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
    257         with SparkContext._lock:
    258             if not SparkContext._gateway:
--> 259                 SparkContext._gateway = gateway or launch_gateway(conf)
    260                 SparkContext._jvm = SparkContext._gateway.jvm
    261 

/usr/lib/spark/python/pyspark/java_gateway.py in launch_gateway(conf)
     93                 callback_socket.close()
     94         if gateway_port is None:
---> 95             raise Exception("Java gateway process exited before sending the driver its port number")
     96 
     97         # In Windows, ensure the Java child processes do not linger after Python has exited.

Exception: Java gateway process exited before sending the driver its port number

Configuration



In [2]:

    
debugMode = True

Support functions



In [3]:

    
def dprint(msg):
    if (debugMode):
        print(msg)



In [4]:

    
def get_hdfs_client():
    return InsecureClient("pheno0.phenovari-utwente.surf-hosted.nl:50070", user="pheno",
         root="/")

Read GeoTiffs



In [5]:

    
def getDataSet(directoryPath):
    dprint("-------------------------------")
    dprint("Running getDataSet(directoryPath)")
    dprint("Start time: " + str(datetime.datetime.now()))
    dprint("-------------------------------")
    dprint("directoryPath: " + directoryPath)
    dprint("-------------------------------")
    files = sc.binaryFiles(directoryPath)
    fileList = files.keys().collect()
    dprint("Found files: " + str(fileList))
    dataArray = []
    for f in fileList:
        data = files.lookup(f)
        dataByteArray = bytearray(data[0])
        memfile = MemoryFile(dataByteArray)
        dataset = memfile.open()
        #relevantBand = np.uint8(dataset.read()[0])
        relevantBand = np.array(dataset.read()[0])
        memfile.close()
        dprint("relevantBand.shape: " + str(relevantBand.shape))
        flattenedDataSet = relevantBand.flatten()
        dprint("flattenedDataSet.shape: " + str(flattenedDataSet.shape))
        dataArray.append(flattenedDataSet)
    #Pandas appends a vectors as a column to a DataFrame
    dataSet = pandas.DataFrame(dataArray).T
    maxDimension = max(dataSet.shape)
    minDimension = min(dataSet.shape)
    dataSetWithIndex = dataSet.reset_index()
    dataSetWithoutNan = dataSetWithIndex.dropna(axis = 0, thresh = minDimension)
    dataSetIndex = dataSetWithoutNan.index
    dataSetWithoutIndex = np.array(dataSetWithoutNan.drop("index", axis = 1))
    dprint("-------------------------------")
    dprint("End time: " + str(datetime.datetime.now()))
    dprint("Ending getDataSet(directoryPath)")
    dprint("-------------------------------")
    return dataSetWithoutIndex, dataSetIndex, maxDimension



In [6]:

    
def normDifferenceUpToSign(vector1, vector2): # Necesarry because algorithm sometimes gives back the negative of the expected result
    normDifference = norm(vector1 - vector2)
    if normDifference > 1:
            normDifference = norm(vector1 + vector2)
    return normDifference



In [7]:

    
def writeMode(resultDir, fileName, i, U, s, V): 
    inFile = "/tmp/" + fileName
    outFile = resultDir + fileName
    
    decompositionFile = open(inFile, "w")
    U.T[i].tofile(decompositionFile, sep = ",")
    decompositionFile.close()
    decompositionFile = open(inFile, "a")
    decompositionFile.write("\n")
    s[i].tofile(decompositionFile, sep = ",")
    decompositionFile.write("\n")
    V.T[i].tofile(decompositionFile, sep = ",")
    decompositionFile.close()
    
    #Upload to HDFS
    subprocess.run(['hadoop', 'dfs', '-copyFromLocal', '-f', inFile, outFile])  

    #Remove from /tmp/
    subprocess.run(['rm', '-fr', inFile])



In [8]:

    
def writeCSV(resultDir, fileName, res):
    inFile = "/tmp/" + fileName
    outFile = resultDir + fileName
    
    decompositionFile = open(inFile, "w")
    res.T.tofile(decompositionFile, sep = ",")
    decompositionFile.close()
    
    #Upload to HDFS
    subprocess.run(['hadoop', 'dfs', '-copyFromLocal', '-f', inFile, outFile])  

    #Remove from /tmp/
    subprocess.run(['rm', '-fr', inFile])



In [9]:

    
def runTest(dataDirectory1, dataDirectory2, resultDir):
    dprint("-------------------------------")
    dprint("Running test")
    dprint("Start time: " + str(datetime.datetime.now()))
    dprint("-------------------------------")

    dataSet1, dataSetIndex1, maxDimension1 = getDataSet(dataDirectory1)
    dataSet2, dataSetIndex2, maxDimension2 = getDataSet(dataDirectory2)
    dprint("dataSet1.shape: " + str(dataSet1.shape))
    dprint("dataSet2.shape: " + str(dataSet2.shape))
    maxDimension = max(max(dataSet1.shape), max(dataSet2.shape))
    minDimension = min(min(dataSet1.shape), min(dataSet2.shape))
    doFullSVD = maxDimension <= 33000
    lowRankQ1, lowRankQ2, lowRankProduct = lowrankproduct(dataSet1, dataSet2, p = 0, i = 2, ifgram = False, iffast = True)
    lowRankU, lowRankS, lowRankVt = svd(lowRankProduct, full_matrices = False)
    lowRankV = lowRankVt.T
    lowRankU2 = lowRankQ1 @ lowRankU
    lowRankV2 = lowRankQ2 @ lowRankV
    new_index1 = pandas.Index(range(maxDimension1), name = "index")
    new_index2 = pandas.Index(range(maxDimension2), name = "index")
    lowRankU3 = np.array(pandas.DataFrame(lowRankU2).reindex(dataSetIndex1).reindex(new_index1))
    lowRankV3 = np.array(pandas.DataFrame(lowRankV2).reindex(dataSetIndex2).reindex(new_index2))
    dprint("lowRankU.shape: " + str(lowRankU.shape))
    dprint("lowRankS.shape: " + str(lowRankS.shape))
    dprint("lowRankV.shape: " + str(lowRankV.shape))
    dprint("lowRankU2.shape: " + str(lowRankU2.shape))
    dprint("lowRankV2.shape: " + str(lowRankV2.shape))
    dprint("lowRankU3.shape: " + str(lowRankU3.shape))
    dprint("lowRankV3.shape: " + str(lowRankV3.shape))
    dprint("Singular values of low-rank product: ")
    dprint(lowRankS)
    dprint("lowRankU2.T[0][:minDimension]: ")
    dprint(lowRankU2.T[0][:minDimension])
    dprint("lowRankV2.T[0][:minDimension]: ")
    dprint(lowRankV2.T[0][:minDimension])
    if doFullSVD:
        fullProduct = dataSet1 @ dataSet2.T
        dprint("fullProduct shape " + str(fullProduct.shape))
        dprint("ncomponents " + str(minDimension))
        fullU, fullS, fullVt = randomized_svd(fullProduct, n_components = minDimension)
        fullV = fullVt.T
        dprint("fullU.shape: " + str(fullU.shape))
        dprint("fullS.shape: " + str(fullS.shape))
        dprint("fullV.shape: " + str(fullV.shape))
        dprint("Singular values of full product: ")
        dprint(fullS)
        dprint("fullU.T[0][:minDimension]: ")
        dprint(fullU.T[0][:minDimension])
        dprint("fullV.T[0][:minDimension]: ")
        dprint(fullV.T[0][:minDimension])
    for i in range(len(lowRankS)):
        iString = str(i + 1).zfill(2)
        if doFullSVD:
            u = fullU.T[i]
            v = fullV.T[i]
        else:
            u = dataSet1 @ (dataSet2.T @ lowRankV2.T[i]) / lowRankS[i]
            v = dataSet2 @ (dataSet1.T @ lowRankU2.T[i]) / lowRankS[i]
        dprint("Norm difference u" + iString + ": " + str(normDifferenceUpToSign(lowRankU2.T[i], u)))
        dprint("Norm difference v" + iString + ": " + str(normDifferenceUpToSign(lowRankV2.T[i], v)))
        writeMode(resultDir, "ModeWithoutNan" + iString + ".txt", i, lowRankU2, lowRankS, lowRankV2)
        writeMode(resultDir, "ModeWithNan" + iString + ".txt", i, lowRankU3, lowRankS, lowRankV3)
    #lowRankU2.T.tofile(resultDirectory + "/U.csv", sep = ",")
    writeCSV(resultDir, "U.csv", lowRankU2)
    #lowRankS.tofile(resultDirectory + "/s.csv", sep = ",")
    writeCSV(resultDir, "s.csv", lowRankS)
    #lowRankV2.T.tofile(resultDirectory + "/V.csv", sep = ",")
    writeCSV(resultDir, "V.csv", lowRankV2)

    dprint("-------------------------------")
    dprint("Ending test")
    dprint("End time: " + str(datetime.datetime.now()))
    dprint("-------------------------------")

Tests

Test 1



In [10]:

    
print("-------------------------------")
print("Running test 1")
print("Start time: " + str(datetime.datetime.now()))
print("-------------------------------")

dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomGridmet/"
dataDirectory2 = "hdfs:///user/hadoop/spring-index/LeafGridmet/"
resultDirectory = "hdfs:///user/pheno/svd/BloomGridmetLeafGridmet/"

#Create Result dir
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

print("-------------------------------")
print("Ending test 1")
print("End time: " + str(datetime.datetime.now()))
print("-------------------------------")









    



-------------------------------
Running test 1
Start time: 2017-11-03 12:01:43.049522
-------------------------------
-------------------------------
Running test
Start time: 2017-11-03 12:01:46.405227
-------------------------------
-------------------------------
Running getDataSet(directoryPath)
Start time: 2017-11-03 12:01:46.405997
-------------------------------
directoryPath: hdfs:///user/hadoop/spring-index/BloomGridmet/
-------------------------------






    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-30ed339115ce> in <module>()
     11 subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])
     12 
---> 13 runTest(dataDirectory1, dataDirectory2, resultDirectory)
     14 
     15 print("-------------------------------")

<ipython-input-9-ad7849584eeb> in runTest(dataDirectory1, dataDirectory2, resultDir)
      5     dprint("-------------------------------")
      6 
----> 7     dataSet1, dataSetIndex1, maxDimension1 = getDataSet(dataDirectory1)
      8     dataSet2, dataSetIndex2, maxDimension2 = getDataSet(dataDirectory2)
      9     dprint("dataSet1.shape: " + str(dataSet1.shape))

<ipython-input-5-ed9b3458bd05> in getDataSet(directoryPath)
      6     dprint("directoryPath: " + directoryPath)
      7     dprint("-------------------------------")
----> 8     files = sc.binaryFiles(directoryPath)
      9     fileList = files.keys().collect()
     10     dprint("Found files: " + str(fileList))

NameError: name 'sc' is not defined

Test 2



In [63]:

    
print("-------------------------------")
print("Running test 2")
print("Start time: " + str(datetime.datetime.now()))
print("-------------------------------")

dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/spring-index/LeafFinalLow/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLowLeafFinalLow/"

#Create Result dir
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

print("-------------------------------")
print("Ending test 2")
print("End time: " + str(datetime.datetime.now()))
print("-------------------------------")









    



-------------------------------
Running test 2
Start time: 2017-10-27 13:16:20.092836
-------------------------------
---------------------------
Running lowrankproduct(dataSet1, dataSet2, p, i, ifgram, iffast)
Start time: 2017-10-27 13:16:55.997545
---------------------------
dataSet1.shape: (31089, 26)
dataSet2.shape: (31089, 26)
p: 0
i: 2
ifgram: False
iffast: True
---------------------------
k: 26
l: 26
lowRankQ1.shape: (31089, 26)
lowRankQ2.shape: (31089, 26)
B1.shape: (26, 26)
B2.shape: (26, 26)
lowRankProduct.shape: (26, 26)
-------------------------------
Ending lowrankproductsvd(dataSet1, dataSet2, p, i, ifgram, iffast)
End time: 2017-10-27 13:17:06.911020
-------------------------------
-------------------------------
Ending test 2
End time: 2017-10-27 13:19:24.926910
-------------------------------

Test 3



In [61]:

    
print("-------------------------------")
print("Running test 3")
print("Start time: " + str(datetime.datetime.now()))
print("-------------------------------")

dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLow/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLowSOSTLow/"

#Create Result dir
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

print("-------------------------------")
print("Ending test 3")
print("End time: " + str(datetime.datetime.now()))
print("-------------------------------")









    



-------------------------------
Running test 3
Start time: 2017-10-27 13:06:04.778482
-------------------------------
---------------------------
Running lowrankproduct(dataSet1, dataSet2, p, i, ifgram, iffast)
Start time: 2017-10-27 13:06:41.716855
---------------------------
dataSet1.shape: (31089, 26)
dataSet2.shape: (31524, 26)
p: 0
i: 2
ifgram: False
iffast: True
---------------------------
k: 26
l: 26
lowRankQ1.shape: (31089, 26)
lowRankQ2.shape: (31524, 26)
B1.shape: (26, 26)
B2.shape: (26, 26)
lowRankProduct.shape: (26, 26)
-------------------------------
Ending lowrankproductsvd(dataSet1, dataSet2, p, i, ifgram, iffast)
End time: 2017-10-27 13:06:51.317871
-------------------------------
-------------------------------
Ending test 3
End time: 2017-10-27 13:09:08.199955
-------------------------------

Test 4



In [62]:

    
print("-------------------------------")
print("Running test 4")
print("Start time: " + str(datetime.datetime.now()))
print("-------------------------------")

dataDirectory1 = "hdfs:///user/hadoop/spring-index/LeafFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLow/"
resultDirectory = "hdfs:///user/pheno/svd/LeafFinalLowSOSTLow/"

#Create Result dir
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

print("-------------------------------")
print("Ending test 4")
print("End time: " + str(datetime.datetime.now()))
print("-------------------------------")









    



-------------------------------
Running test 4
Start time: 2017-10-27 13:11:26.834135
-------------------------------
---------------------------
Running lowrankproduct(dataSet1, dataSet2, p, i, ifgram, iffast)
Start time: 2017-10-27 13:12:03.052125
---------------------------
dataSet1.shape: (31089, 26)
dataSet2.shape: (31524, 26)
p: 0
i: 2
ifgram: False
iffast: True
---------------------------
k: 26
l: 26
lowRankQ1.shape: (31089, 26)
lowRankQ2.shape: (31524, 26)
B1.shape: (26, 26)
B2.shape: (26, 26)
lowRankProduct.shape: (26, 26)
-------------------------------
Ending lowrankproductsvd(dataSet1, dataSet2, p, i, ifgram, iffast)
End time: 2017-10-27 13:12:14.168737
-------------------------------
-------------------------------
Ending test 4
End time: 2017-10-27 13:14:30.913878
-------------------------------

Test 5



In [67]:

    
print("-------------------------------")
print("Running test 5")
print("Start time: " + str(datetime.datetime.now()))
print("-------------------------------")

dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLowPR/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLowPR/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLowPRSOSTLowPR/"

#Create Result dir
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

print("-------------------------------")
print("Ending test 5")
print("End time: " + str(datetime.datetime.now()))
print("-------------------------------")









    



-------------------------------
Running test 5
Start time: 2017-10-27 16:46:10.109739
-------------------------------






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-67-a98fecbaf10f> in <module>()
     11 subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])
     12 
---> 13 runTest(dataDirectory1, dataDirectory2, resultDirectory)
     14 
     15 print("-------------------------------")

<ipython-input-47-ad7849584eeb> in runTest(dataDirectory1, dataDirectory2, resultDir)
      5     dprint("-------------------------------")
      6 
----> 7     dataSet1, dataSetIndex1, maxDimension1 = getDataSet(dataDirectory1)
      8     dataSet2, dataSetIndex2, maxDimension2 = getDataSet(dataDirectory2)
      9     dprint("dataSet1.shape: " + str(dataSet1.shape))

<ipython-input-10-ed9b3458bd05> in getDataSet(directoryPath)
      7     dprint("-------------------------------")
      8     files = sc.binaryFiles(directoryPath)
----> 9     fileList = files.keys().collect()
     10     dprint("Found files: " + str(fileList))
     11     dataArray = []

/usr/lib/spark/python/pyspark/rdd.py in collect(self)
    806         """
    807         with SCCallSiteSync(self.context) as css:
--> 808             port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    809         return list(_load_from_socket(port, self._jrdd_deserializer))
    810 

/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1129             proto.END_COMMAND_PART
   1130 
-> 1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
   1133             answer, self.gateway_client, self.target_id, self.name)

/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in send_command(self, command, retry, binary)
    881         connection = self._get_connection()
    882         try:
--> 883             response = connection.send_command(command)
    884             if binary:
    885                 return response, self._create_connection_guard(connection)

/usr/lib/spark/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in send_command(self, command)
   1026 
   1027         try:
-> 1028             answer = smart_decode(self.stream.readline()[:-1])
   1029             logger.debug("Answer received: {0}".format(answer))
   1030             if answer.startswith(proto.RETURN_MESSAGE):

/usr/lib/python3.5/socket.py in readinto(self, b)
    573         while True:
    574             try:
--> 575                 return self._sock.recv_into(b)
    576             except timeout:
    577                 self._timeout_occurred = True

KeyboardInterrupt:

Test 6



In [13]:

    
print("-------------------------------")
print("Running test 6")
print("Start time: " + str(datetime.datetime.now()))
print("-------------------------------")

dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinal/"
dataDirectory2 = "hdfs:///user/hadoop/spring-index/LeafFinal/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLeafFinal/"

#Create Result dir
subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])

runTest(dataDirectory1, dataDirectory2, resultDirectory)

print("-------------------------------")
print("Ending test 6")
print("End time: " + str(datetime.datetime.now()))
print("-------------------------------")









    



-------------------------------
Running test 6
Start time: 2017-10-30 12:48:04.816080
-------------------------------
-------------------------------
Running test
Start time: 2017-10-30 12:48:06.428073
-------------------------------
-------------------------------
Running getDataSet(directoryPath)
Start time: 2017-10-30 12:48:06.428889
-------------------------------
directoryPath: hdfs:///user/hadoop/spring-index/BloomFinal/
-------------------------------
Found files: ['hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1980.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1984.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1986.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1981.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1982.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1987.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1983.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1989.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1993.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1995.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1988.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1985.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1992.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1991.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1990.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1997.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1994.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1996.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2000.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1999.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2009.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/1998.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2007.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2002.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2001.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2004.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2003.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2008.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2010.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2006.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2012.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2005.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2015.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2011.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2014.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/BloomFinal/2013.tif']
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
-------------------------------
End time: 2017-10-30 13:18:27.284805
Ending getDataSet(directoryPath)
-------------------------------
-------------------------------
Running getDataSet(directoryPath)
Start time: 2017-10-30 13:18:27.488375
-------------------------------
directoryPath: hdfs:///user/hadoop/spring-index/LeafFinal/
-------------------------------
Found files: ['hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1982.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1980.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1998.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1986.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1981.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1983.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1988.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1985.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1987.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1984.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2001.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1989.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1992.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1994.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1991.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1995.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1990.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1997.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2005.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1996.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1993.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2000.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/1999.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2002.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2008.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2012.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2003.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2004.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2007.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2006.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2009.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2015.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2010.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2013.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2011.tif', 'hdfs://pheno0.phenovari-utwente.surf-hosted.nl:9000/user/hadoop/spring-index/LeafFinal/2014.tif']
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
relevantBand.shape: (3892, 7808)
flattenedDataSet.shape: (30388736,)
-------------------------------
End time: 2017-10-30 13:50:15.884963
Ending getDataSet(directoryPath)
-------------------------------
dataSet1.shape: (11819120, 36)
dataSet2.shape: (11819636, 36)
---------------------------
Running lowrankproduct(dataSet1, dataSet2, p, i, ifgram, iffast)
Start time: 2017-10-30 13:50:15.975175
---------------------------
dataSet1.shape: (11819120, 36)
dataSet2.shape: (11819636, 36)
p: 0
i: 2
ifgram: False
iffast: True
---------------------------
k: 36
l: 36






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-374aec364c88> in <module>()
     11 subprocess.run(['hadoop', 'dfs', '-mkdir', resultDirectory])
     12 
---> 13 runTest(dataDirectory1, dataDirectory2, resultDirectory)
     14 
     15 print("-------------------------------")

<ipython-input-12-ad7849584eeb> in runTest(dataDirectory1, dataDirectory2, resultDir)
     12     minDimension = min(min(dataSet1.shape), min(dataSet2.shape))
     13     doFullSVD = maxDimension <= 33000
---> 14     lowRankQ1, lowRankQ2, lowRankProduct = lowrankproduct(dataSet1, dataSet2, p = 0, i = 2, ifgram = False, iffast = True)
     15     lowRankU, lowRankS, lowRankVt = svd(lowRankProduct, full_matrices = False)
     16     lowRankV = lowRankVt.T

/data/local/jupyterhub/modules/python/lowrankproduct.py in lowrankproduct(dataSet1, dataSet2, p, i, ifgram, iffast)
     45                 l = k + p
     46                 print("l: " + str(l))
---> 47                 lowRankQ1 = iterations(dataSet1, k, i, l, ifgram, iffast)
     48                 lowRankQ2 = iterations(dataSet2, k, i, l, ifgram, iffast)
     49                 print("lowRankQ1.shape: " + str(lowRankQ1.shape))

/data/local/jupyterhub/modules/python/lowrank.py in iterations(A, k, i, l, ifgram, iffast)
     66             Q, _, _ = gram1(Y)
     67         else:
---> 68             Q, _, _ = randtsvd1(Y, iffast)
     69         # Form (or, rather, update) Y = A^* Q.
     70         Y = A.T @ Q

/data/local/jupyterhub/modules/python/randtsvd.py in randtsvd1(A, iffast)
     68         B = Omega @ A.T
     69     # Construct a QR decomposition of B^*.
---> 70     Q, R = tsqr(B.T)
     71     # Calculate the SVD  Utilde S Vtilde^* = R, where S = diag(s).
     72     Utilde, s, Vtildet = svd(R, full_matrices=False)

/data/local/jupyterhub/modules/python/randtsvd.py in tsqr(A)
    270     # Recursively merge the triangular factors in QR decompositions,
    271     # starting with the construction of QR decompositions at the leaves.
--> 272     R = tsqrr(A, t)
    273     # Discard rows of the triangular matrix R for which the diagonal entry
    274     # is numerically zero.

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    258             # via their joint QR decomposition.
    259             Rlow = tsqrr(A, t['blow'])
--> 260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))
    262             _, R = qr(Rlohi, mode='economic')

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    258             # via their joint QR decomposition.
    259             Rlow = tsqrr(A, t['blow'])
--> 260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))
    262             _, R = qr(Rlohi, mode='economic')

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    258             # via their joint QR decomposition.
    259             Rlow = tsqrr(A, t['blow'])
--> 260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))
    262             _, R = qr(Rlohi, mode='economic')

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    258             # via their joint QR decomposition.
    259             Rlow = tsqrr(A, t['blow'])
--> 260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))
    262             _, R = qr(Rlohi, mode='economic')

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    257             # Merge the triangular factors in QR decompositions
    258             # via their joint QR decomposition.
--> 259             Rlow = tsqrr(A, t['blow'])
    260             Rhigh = tsqrr(A, t['bhigh'])
    261             Rlohi = np.vstack((Rlow, Rhigh))

/data/local/jupyterhub/modules/python/randtsvd.py in tsqrr(A, t)
    263         else:
    264             # Being at a leaf in the tree, construct the QR decomp. directly.
--> 265             _, R = qr(A[t['low']:(t['high']+1), :], mode='economic')
    266         return R
    267 

/usr/local/lib/python3.5/dist-packages/scipy/linalg/decomp_qr.py in qr(a, overwrite_a, lwork, mode, pivoting, check_finite)
    124 
    125     if check_finite:
--> 126         a1 = numpy.asarray_chkfinite(a)
    127     else:
    128         a1 = numpy.asarray(a)

/usr/local/lib/python3.5/dist-packages/numpy/lib/function_base.py in asarray_chkfinite(a, dtype, order)
   1213     if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
   1214         raise ValueError(
-> 1215             "array must not contain infs or NaNs")
   1216     return a
   1217 

ValueError: array must not contain infs or NaNs

SVD test ground



In [5]:

    
dataDirectory1 = "hdfs:///user/hadoop/spring-index/BloomFinalLow/"
dataDirectory2 = "hdfs:///user/hadoop/avhrr/SOSTLow/"
resultDirectory = "hdfs:///user/pheno/svd/BloomFinalLowSOSTLow/"

dataSet1, dataSetIndex1, maxDimension1 = getDataSet(dataDirectory1)
dataSet2, dataSetIndex2, maxDimension2 = getDataSet(dataDirectory2)

print("dataSet1.shape: " + str(dataSet1.shape))
print("dataSet2.shape: " + str(dataSet2.shape))









    



(31524, 26)
(31524, 26)
dataSet1.shape: (31089, 26)
dataSet2.shape: (31524, 26)



In [6]:

    
fullProduct = dataSet1 @ dataSet2.T



In [7]:

    
fullProduct.shape









    Out[7]:





(31089, 31524)



In [8]:

    
minDimension = min(min(dataSet1.shape), min(dataSet2.shape))
randU, randS, randVt = randomized_svd(fullProduct, n_components=minDimension)



In [ ]:

    
#normU, normS, normVt = svd(fullProduct, full_matrices = True)
normU, normS, normVt = svd(fullProduct, full_matrices = False)



In [ ]: