In [2]:
import findspark
findspark.init()
import pyspark
import random
sc = pyspark.SparkContext(appName="Pi")
num_samples = 100000
def inside(p):
    # A point (x, y) drawn uniformly from the unit square lands inside the
    # quarter circle with probability pi/4.
    x, y = random.random(), random.random()
    return x * x + y * y < 1
count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)
sc.stop()
In [5]:
# from sklearn.datasets.samples_generator import make_blobs
import scipy.io
import csv
import os
import numpy as np
import random as rn
import operator
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import math as m
import matplotlib.ticker as ticker
In [9]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline
# from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import GaussianMixture
# from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
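# GaussianMixture is imported above but not exercised in this section. Below is
# a minimal sketch of the pyspark.mllib GaussianMixture API on toy data; the
# points, k=2, and the "GMMDemo" app name are illustrative assumptions, not
# from the original notebook.
import pyspark
from pyspark.mllib.clustering import GaussianMixture

sc = pyspark.SparkContext(appName="GMMDemo")
# Two well-separated toy clusters; the GMM should fit one Gaussian to each.
points = sc.parallelize([[0.1, 0.2], [0.0, 0.1], [9.8, 10.1], [10.0, 9.9]])
gmm = GaussianMixture.train(points, k=2, seed=1)
print(gmm.predict(points).collect())             # cluster index for each point
print([(g.mu, g.sigma) for g in gmm.gaussians])  # fitted means and covariances
sc.stop()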
In [6]:
# Reference:
# https://support.10xgenomics.com/single-cell/software/pipelines/latest/output/matrices
# on how to process single-cell output matrices in Python
genome = "hg19"
matrices_dir = "../PCA/filtered_gene_bc_matrices"
human_matrix_dir = os.path.join(matrices_dir, genome)
# Sparse gene-by-cell expression matrix in Matrix Market format.
mat = scipy.io.mmread(os.path.join(human_matrix_dir, "matrix.mtx"))
# Read the gene annotation table once, then split out the ID and name columns.
genes_path = os.path.join(human_matrix_dir, "genes.tsv")
with open(genes_path) as f:
    gene_rows = list(csv.reader(f, delimiter="\t"))
gene_ids = np.array([row[0] for row in gene_rows])
gene_names = np.array([row[1] for row in gene_rows])
gene_ids_names = np.matrix(gene_rows)
# Cell barcodes, one per column of the expression matrix.
barcodes_path = os.path.join(human_matrix_dir, "barcodes.tsv")
with open(barcodes_path) as f:
    barcodes = np.array([row[0] for row in csv.reader(f, delimiter="\t")])
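# The pyspark.ml imports above (VectorAssembler, StandardScaler, PCA, Pipeline)
# suggest the next step: a standardize-then-PCA pipeline over this matrix. A
# minimal sketch under stated assumptions: an active SparkSession is created
# here as `spark`; the matrix is subset to the first 2000 genes and 100 cells
# so the dense conversion and the PCA stay cheap; k=10 and the column names
# "features", "scaled", "pca" are illustrative, not from the original notebook.
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StandardScaler, PCA
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName("scPCA").getOrCreate()

# Cells are columns of the 10x matrix; transpose so each row is one cell.
cells = mat.tocsc()[:2000, :100].T.toarray()
df = spark.createDataFrame([(Vectors.dense(row),) for row in cells],
                           ["features"])

scaler = StandardScaler(inputCol="features", outputCol="scaled",
                        withMean=True, withStd=True)
pca = PCA(k=10, inputCol="scaled", outputCol="pca")
model = Pipeline(stages=[scaler, pca]).fit(df)
reduced = model.transform(df).select("pca")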