In [59]:
import warnings
warnings.filterwarnings('ignore')

import sys
import random
import numpy as np
import csv
from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().magic(u'matplotlib inline')

In [60]:
def read_tsv( cols, col_types=None):
  fhandle = open('/root/temp/000000_0', 'r')
  pieces = []
  pieces.append(pd.read_csv(fhandle, names=cols, dtype=col_types,header=None, delimiter="\t"))
  fhandle.close()
  return pd.concat(pieces, ignore_index=True)

In [61]:
cols = ['time', 'ip', 'country', 'status'];
pHandle = read_tsv( cols)

pHandle.shape

df = pHandle[pHandle['country']=='US'].dropna(subset=['status'])
df

grouped = pHandle.groupby('country').count()
grouped.plot(kind='bar')


Out[61]:
<matplotlib.axes._subplots.AxesSubplot at 0x5442410>

In [62]:
%%writefile preprocess1.pig
DEFINE preprocess( ) returns data {
    
    tmp = LOAD '/root/data_1' using PigStorage(',')
    as ( time: chararray, ip: chararray, country: chararray, status: chararray);
    $data = filter tmp by country=='US';
        };
    data = preprocess();
    STORE data INTO '/root/output';
    copyToLocal /root/output /root/output)


Overwriting preprocess1.pig

In [63]:
%%bash --err pig_out --bg 
pig -f preprocess1.pig


Starting job # 8 in a separate thread.

In [64]:
while True:
    line = pig_out.readline()
    if not line: 
        break
    sys.stdout.write("%s" % line)
    sys.stdout.flush()


WARNING: Use "yarn jar" to launch YARN applications.
15/10/10 20:47:06 INFO pig.ExecTypeProvider: Trying ExecType : LOCAL
15/10/10 20:47:06 INFO pig.ExecTypeProvider: Trying ExecType : MAPREDUCE
15/10/10 20:47:06 INFO pig.ExecTypeProvider: Picked MAPREDUCE as the ExecType
2015-10-10 20:47:06,523 [main] INFO  org.apache.pig.Main - Apache Pig version 0.15.0.2.3.0.0-2557 (rexported) compiled Jul 14 2015, 10:10:23
2015-10-10 20:47:06,523 [main] INFO  org.apache.pig.Main - Logging error messages to: /usr/hdp/2.3.0.0-2557/pig/pig_1444510026521.log
2015-10-10 20:47:07,871 [main] INFO  org.apache.pig.impl.util.Utils - Default bootup file /root/.pigbootup not found
2015-10-10 20:47:08,093 [main] INFO  org.apache.pig.backend.hadoop.executionengine.HExecutionEngine - Connecting to hadoop file system at: hdfs://sandbox.hortonworks.com:8020
2015-10-10 20:47:09,958 [main] INFO  org.apache.pig.tools.pigstats.ScriptState - Pig features used in the script: FILTER
2015-10-10 20:47:10,020 [main] INFO  org.apache.pig.data.SchemaTupleBackend - Key [pig.schematuple] was not set... will not generate code.
2015-10-10 20:47:10,073 [main] INFO  org.apache.pig.newplan.logical.optimizer.LogicalPlanOptimizer - {RULES_ENABLED=[AddForEach, ColumnMapKeyPrune, ConstantCalculator, GroupByConstParallelSetter, LimitOptimizer, LoadTypeCastInserter, MergeFilter, MergeForEach, PartitionFilterOptimizer, PredicatePushdownOptimizer, PushDownForEachFlatten, PushUpFilter, SplitFilter, StreamTypeCastInserter]}
2015-10-10 20:47:10,284 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRCompiler - File concatenation threshold: 100 optimistic? false
2015-10-10 20:47:10,333 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size before optimization: 1
2015-10-10 20:47:10,333 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MultiQueryOptimizer - MR plan size after optimization: 1
2015-10-10 20:47:11,135 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:11,375 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:11,736 [main] INFO  org.apache.pig.tools.pigstats.mapreduce.MRScriptState - Pig script settings are added to the job
2015-10-10 20:47:11,747 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - mapred.job.reduce.markreset.buffer.percent is not set, set to default 0.3
2015-10-10 20:47:11,753 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - This job cannot be converted run in-process
2015-10-10 20:47:12,346 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Added jar file:/usr/hdp/2.3.0.0-2557/pig/pig-0.15.0.2.3.0.0-2557-core-h2.jar to DistributedCache through /tmp/temp-185899652/tmp-1337514546/pig-0.15.0.2.3.0.0-2557-core-h2.jar
2015-10-10 20:47:12,387 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Added jar file:/usr/hdp/2.3.0.0-2557/pig/lib/automaton-1.11-8.jar to DistributedCache through /tmp/temp-185899652/tmp1758217466/automaton-1.11-8.jar
2015-10-10 20:47:12,447 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Added jar file:/usr/hdp/2.3.0.0-2557/pig/lib/antlr-runtime-3.4.jar to DistributedCache through /tmp/temp-185899652/tmp-255607268/antlr-runtime-3.4.jar
2015-10-10 20:47:12,519 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Added jar file:/usr/hdp/2.3.0.0-2557/hadoop-mapreduce/joda-time-2.8.1.jar to DistributedCache through /tmp/temp-185899652/tmp246124004/joda-time-2.8.1.jar
2015-10-10 20:47:12,606 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.JobControlCompiler - Setting up single store job
2015-10-10 20:47:12,623 [main] INFO  org.apache.pig.data.SchemaTupleFrontend - Key [pig.schematuple] is false, will not generate code.
2015-10-10 20:47:12,624 [main] INFO  org.apache.pig.data.SchemaTupleFrontend - Starting process to move generated code to distributed cacche
2015-10-10 20:47:12,625 [main] INFO  org.apache.pig.data.SchemaTupleFrontend - Setting key [pig.schematuple.classes] with classes to deserialize []
2015-10-10 20:47:12,739 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 1 map-reduce job(s) waiting for submission.
2015-10-10 20:47:13,066 [JobControl] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:13,069 [JobControl] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:13,233 [JobControl] WARN  org.apache.hadoop.mapreduce.JobResourceUploader - No job jar file set.  User classes may not be found. See Job or Job#setJar(String).
2015-10-10 20:47:13,414 [JobControl] INFO  org.apache.hadoop.mapreduce.lib.input.FileInputFormat - Total input paths to process : 1
2015-10-10 20:47:13,415 [JobControl] INFO  org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths to process : 1
2015-10-10 20:47:13,432 [JobControl] INFO  org.apache.pig.backend.hadoop.executionengine.util.MapRedUtil - Total input paths (combined) to process : 1
2015-10-10 20:47:13,505 [JobControl] INFO  org.apache.hadoop.mapreduce.JobSubmitter - number of splits:1
2015-10-10 20:47:13,981 [JobControl] INFO  org.apache.hadoop.mapreduce.JobSubmitter - Submitting tokens for job: job_1444504078024_0012
2015-10-10 20:47:14,228 [JobControl] INFO  org.apache.hadoop.mapred.YARNRunner - Job jar is not present. Not adding any jar to the list of resources.
2015-10-10 20:47:14,403 [JobControl] INFO  org.apache.hadoop.yarn.client.api.impl.YarnClientImpl - Submitted application application_1444504078024_0012
2015-10-10 20:47:14,488 [JobControl] INFO  org.apache.hadoop.mapreduce.Job - The url to track the job: http://sandbox.hortonworks.com:8088/proxy/application_1444504078024_0012/
2015-10-10 20:47:14,491 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - HadoopJobId: job_1444504078024_0012
2015-10-10 20:47:14,491 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Processing aliases data,macro_preprocess_tmp_0
2015-10-10 20:47:14,491 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - detailed locations: M: macro_preprocess_tmp_0[3,10],macro_preprocess_tmp_0[-1,-1],data[5,11] C:  R: 
2015-10-10 20:47:14,535 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 0% complete
2015-10-10 20:47:14,535 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Running jobs are [job_1444504078024_0012]
2015-10-10 20:47:31,678 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 50% complete
2015-10-10 20:47:31,678 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Running jobs are [job_1444504078024_0012]
2015-10-10 20:47:34,900 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:34,901 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:34,911 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2015-10-10 20:47:35,373 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:35,374 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:35,380 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2015-10-10 20:47:35,626 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:35,626 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:35,634 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2015-10-10 20:47:35,717 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - 100% complete
2015-10-10 20:47:35,720 [main] INFO  org.apache.pig.tools.pigstats.mapreduce.SimplePigStats - Script Statistics: 

HadoopVersion	PigVersion	UserId	StartedAt	FinishedAt	Features
2.7.1.2.3.0.0-2557	0.15.0.2.3.0.0-2557	root	2015-10-10 20:47:11	2015-10-10 20:47:35	FILTER

Success!

Job Stats (time in seconds):
JobId	Maps	Reduces	MaxMapTime	MinMapTime	AvgMapTime	MedianMapTime	MaxReduceTime	MinReduceTime	AvgReduceTime	MedianReducetime	Alias	Feature	Outputs
job_1444504078024_0012	1	0	6	6	6	6	0	0	0	0	data,macro_preprocess_tmp_0	MAP_ONLY	/root/output,

Input(s):
Successfully read 4988 records (226342 bytes) from: "/root/data_1"

Output(s):
Successfully stored 2998 records (135778 bytes) in: "/root/output"

Counters:
Total records written : 2998
Total bytes written : 135778
Spillable Memory Manager spill count : 0
Total bags proactively spilled: 0
Total records proactively spilled: 0

Job DAG:
job_1444504078024_0012


2015-10-10 20:47:35,880 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:35,880 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:35,889 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2015-10-10 20:47:36,091 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:36,091 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:36,098 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2015-10-10 20:47:36,302 [main] INFO  org.apache.hadoop.yarn.client.api.impl.TimelineClientImpl - Timeline service address: http://sandbox.hortonworks.com:8188/ws/v1/timeline/
2015-10-10 20:47:36,302 [main] INFO  org.apache.hadoop.yarn.client.RMProxy - Connecting to ResourceManager at sandbox.hortonworks.com/10.0.2.15:8050
2015-10-10 20:47:36,311 [main] INFO  org.apache.hadoop.mapred.ClientServiceDelegate - Application state is completed. FinalApplicationStatus=SUCCEEDED. Redirecting to job history server
2015-10-10 20:47:36,369 [main] INFO  org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MapReduceLauncher - Success!
2015-10-10 20:47:36,579 [main] ERROR org.apache.pig.tools.grunt.Grunt - ERROR 1000: Error during parsing. Lexical error at line 9, column 42.  Encountered: ")" (41), after : ""
Details at logfile: /usr/hdp/2.3.0.0-2557/pig/pig_1444510026521.log
2015-10-10 20:47:36,609 [main] INFO  org.apache.pig.Main - Pig script completed in 30 seconds and 281 milliseconds (30281 ms)

In [65]:
def read_dataset(cols):
  fhandle = open('/root/output/part-m-00000', 'r')
  pieces = []
  pieces.append(pd.read_csv(fhandle, names=cols, dtype=None,header=None, delimiter="\t"))
  fhandle.close()
  return pd.concat(pieces, ignore_index=True)

In [66]:
cols = ['time', 'ip', 'country', 'status'];
pHandle = read_dataset( cols)

pHandle.shape


Out[66]:
(2998, 4)

In [69]:
# note data is randomly generated so go to /root/output/part-m-00000 
# and select values that exist for the query
values = ['159.128.147.226', '205.244.156.140']
df2 = pHandle[pHandle['ip'].isin(values)]
df2


Out[69]:
time ip country status
10 2015-10-07T15:50:26 159.128.147.226 US SUCCESS
14 2015-10-07T15:50:26 205.244.156.140 US SUCCESS

In [ ]: