In [9]:
from __future__ import print_function;
import sys;
import findspark
findspark.init('/home/imran/spark')


from pyspark.sql import SparkSession;
from pyspark.sql.types import StructType, StructField, StringType, IntegerType;
from pyspark.sql.functions import regexp_replace, ltrim, rtrim;

filepath = "/home/imran/py/PepoleEDA/data/adult.data";

spark = SparkSession.builder.appName("DataFramePivotEx").getOrCreate();

strct = StructType([StructField("age", IntegerType(), True), 
	StructField("workclass", StringType(), True),
	StructField("finalweight", StringType(), True),
	StructField("education", StringType(), True),
	StructField("educationnum", StringType(), True),
	StructField("maritalstatus", StringType(), True),
	StructField("occupation", StringType(), True),
	StructField("relationship", StringType(), True),
	StructField("race", StringType(), True),
	StructField("gender", StringType(), True),
	StructField("capitalgain", StringType(), True),
	StructField("capitalloss", StringType(), True),
	StructField("hoursperweek", StringType(), True),
	StructField("country", StringType(), True),
	StructField("salary", StringType(), True)]);

df = spark.read.csv(filepath, schema=strct);

dfregex1 = df.select(df.workclass, df.finalweight, df.education, df.educationnum, df.maritalstatus, 
                     df.occupation, df.relationship, df.race , df.gender, df.capitalgain, df.capitalloss
                     , df.hoursperweek, df.country, regexp_replace(df.salary, '(<=50K)', '50').alias('salary'));

dfregex = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum
                          , dfregex1.maritalstatus, dfregex1.occupation, dfregex1.relationship, dfregex1.race 
                          , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss, dfregex1.hoursperweek
                          , dfregex1.country, regexp_replace(dfregex1.salary, '(>50K)', '51').alias('salary'));
df =dfregex;

df = dfregex.na.fill({'salary':'0'});
df = df.select(df.workclass, df.education, df.occupation
               , (ltrim(rtrim(df.hoursperweek))).cast(IntegerType()).alias("hoursperweek") 
               , (rtrim(ltrim(df.salary))).cast(IntegerType()).alias("salary") ).na.drop().distinct();
df = df.filter(ltrim(rtrim(df.workclass)) != "?" )
df = df.filter(ltrim(rtrim(df.workclass)) != "Without-pay") 
df = df.filter(ltrim(rtrim(df.workclass)) != "Never-worked") 


df.groupBy("education","occupation").pivot("workclass").sum("salary").show()


+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
|  education|        occupation| Federal-gov| Local-gov| Private| Self-emp-inc| Self-emp-not-inc| State-gov|
+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
|  Assoc-voc|      Adm-clerical|         151|       450|    1409|           50|               50|       150|
|       11th|      Adm-clerical|         150|       200|     901|         null|              100|       100|
|    Masters|  Transport-moving|          50|      null|     304|           50|             null|        50|
|  Assoc-voc| Machine-op-inspct|        null|      null|     654|           51|               50|        51|
|    7th-8th|             Sales|        null|      null|     552|           50|              402|      null|
|    5th-6th| Handlers-cleaners|        null|       101|     501|         null|               50|      null|
|    Masters|      Adm-clerical|         202|       252|     756|           50|             null|       352|
|    Masters|      Armed-Forces|          51|      null|    null|         null|             null|      null|
|  Bachelors|  Transport-moving|         100|       100|     857|         null|              301|      null|
|  Assoc-voc|   Farming-fishing|          50|      null|     452|          101|              904|      null|
|    7th-8th|   Priv-house-serv|        null|      null|     350|         null|             null|      null|
| Assoc-acdm| Handlers-cleaners|        null|        50|     702|         null|             null|      null|
|    1st-4th|     Other-service|        null|       150|     650|         null|              100|        50|
|       10th|     Other-service|         100|       250|    1701|          100|              250|       150|
|  Bachelors| Machine-op-inspct|        null|        50|     905|          152|              101|       100|
|       11th|     Other-service|          50|       350|    1852|         null|              251|       251|
|       12th|   Priv-house-serv|        null|      null|     100|         null|             null|      null|
| Assoc-acdm|  Transport-moving|          50|        50|     501|         null|              252|        50|
|  Bachelors|    Prof-specialty|         858|      2314|    3886|          607|             1465|      1662|
|    5th-6th|   Exec-managerial|        null|      null|    null|         null|               51|      null|
+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
only showing top 20 rows


In [2]:
df.groupBy("education","occupation").pivot("workclass").sum("salary").show()


+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
|  education|        occupation| Federal-gov| Local-gov| Private| Self-emp-inc| Self-emp-not-inc| State-gov|
+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
|  Assoc-voc|      Adm-clerical|         151|       450|    1409|           50|               50|       150|
|       11th|      Adm-clerical|         150|       200|     901|         null|              100|       100|
|    Masters|  Transport-moving|          50|      null|     304|           50|             null|        50|
|  Assoc-voc| Machine-op-inspct|        null|      null|     654|           51|               50|        51|
|    7th-8th|             Sales|        null|      null|     552|           50|              402|      null|
|    5th-6th| Handlers-cleaners|        null|       101|     501|         null|               50|      null|
|    Masters|      Adm-clerical|         202|       252|     756|           50|             null|       352|
|    Masters|      Armed-Forces|          51|      null|    null|         null|             null|      null|
|  Bachelors|  Transport-moving|         100|       100|     857|         null|              301|      null|
|  Assoc-voc|   Farming-fishing|          50|      null|     452|          101|              904|      null|
|    7th-8th|   Priv-house-serv|        null|      null|     350|         null|             null|      null|
| Assoc-acdm| Handlers-cleaners|        null|        50|     702|         null|             null|      null|
|    1st-4th|     Other-service|        null|       150|     650|         null|              100|        50|
|       10th|     Other-service|         100|       250|    1701|          100|              250|       150|
|  Bachelors| Machine-op-inspct|        null|        50|     905|          152|              101|       100|
|       11th|     Other-service|          50|       350|    1852|         null|              251|       251|
|       12th|   Priv-house-serv|        null|      null|     100|         null|             null|      null|
| Assoc-acdm|  Transport-moving|          50|        50|     501|         null|              252|        50|
|  Bachelors|    Prof-specialty|         858|      2314|    3886|          607|             1465|      1662|
|    5th-6th|   Exec-managerial|        null|      null|    null|         null|               51|      null|
+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
only showing top 20 rows


In [5]:
df.crosstab("education","salary").show()


+----------------+---+---+
|education_salary| 50| 51|
+----------------+---+---+
|         HS-grad|899|385|
|             9th|154| 24|
|       Assoc-voc|293|138|
|            10th|243| 35|
|         7th-8th|202| 28|
|      Assoc-acdm|282|119|
|    Some-college|812|355|
|         1st-4th| 77|  5|
|            11th|300| 40|
|       Preschool| 29|  0|
|         5th-6th|102| 12|
|       Doctorate| 57|100|
|            12th|150| 20|
|     Prof-school| 77|122|
|       Bachelors|543|400|
|         Masters|237|206|
+----------------+---+---+


In [6]:
df.crosstab("occupation","salary").show()


+------------------+---+---+
| occupation_salary| 50| 51|
+------------------+---+---+
|      Armed-Forces|  8|  1|
|             Sales|457|235|
|   Exec-managerial|451|396|
|      Craft-repair|437|200|
|   Protective-serv|167| 96|
|   Farming-fishing|316| 79|
|    Prof-specialty|538|398|
| Machine-op-inspct|231| 70|
|      Tech-support|186|108|
|      Adm-clerical|458|163|
| Handlers-cleaners|250| 41|
|  Transport-moving|298|119|
|     Other-service|578| 82|
|   Priv-house-serv| 82|  1|
+------------------+---+---+


In [8]:
df.groupBy("education","occupation").pivot("workclass").sum("salary").show()


+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
|  education|        occupation| Federal-gov| Local-gov| Private| Self-emp-inc| Self-emp-not-inc| State-gov|
+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
|  Assoc-voc|      Adm-clerical|         151|       450|    1409|           50|               50|       150|
|       11th|      Adm-clerical|         150|       200|     901|         null|              100|       100|
|    Masters|  Transport-moving|          50|      null|     304|           50|             null|        50|
|  Assoc-voc| Machine-op-inspct|        null|      null|     654|           51|               50|        51|
|    7th-8th|             Sales|        null|      null|     552|           50|              402|      null|
|    5th-6th| Handlers-cleaners|        null|       101|     501|         null|               50|      null|
|    Masters|      Adm-clerical|         202|       252|     756|           50|             null|       352|
|    Masters|      Armed-Forces|          51|      null|    null|         null|             null|      null|
|  Bachelors|  Transport-moving|         100|       100|     857|         null|              301|      null|
|  Assoc-voc|   Farming-fishing|          50|      null|     452|          101|              904|      null|
|    7th-8th|   Priv-house-serv|        null|      null|     350|         null|             null|      null|
| Assoc-acdm| Handlers-cleaners|        null|        50|     702|         null|             null|      null|
|    1st-4th|     Other-service|        null|       150|     650|         null|              100|        50|
|       10th|     Other-service|         100|       250|    1701|          100|              250|       150|
|  Bachelors| Machine-op-inspct|        null|        50|     905|          152|              101|       100|
|       11th|     Other-service|          50|       350|    1852|         null|              251|       251|
|       12th|   Priv-house-serv|        null|      null|     100|         null|             null|      null|
| Assoc-acdm|  Transport-moving|          50|        50|     501|         null|              252|        50|
|  Bachelors|    Prof-specialty|         858|      2314|    3886|          607|             1465|      1662|
|    5th-6th|   Exec-managerial|        null|      null|    null|         null|               51|      null|
+-----------+------------------+------------+----------+--------+-------------+-----------------+----------+
only showing top 20 rows


In [ ]:


In [ ]: