In [2]:
import seaborn as sns; sns.set(color_codes=True)

# Load the example "tips" dataset bundled with seaborn.
tips = sns.load_dataset("tips")
print(len(tips))
# Rich display of the first rows — preferred over print() for DataFrames,
# and .head() is clearer than slicing with [:5].
tips.head()


   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
244

In [3]:
# Scatter of tip vs. total_bill with a fitted linear-regression line
# and seaborn's default confidence band.
ax = sns.regplot(x="total_bill", y="tip", data=tips)



In [4]:
import matplotlib.pyplot as plt
# Scatter total_bill vs. tip on one wide facet, colored by sex.
# NOTE(review): `size=` was renamed `height=` in seaborn 0.9 — keep `size`
# only while this notebook runs on the old (py2-era) seaborn; confirm env.
g = sns.FacetGrid(tips, hue="sex", size=6, aspect=2)
g.map(plt.scatter, "total_bill", "tip")
g.add_legend()


Out[4]:
<seaborn.axisgrid.FacetGrid at 0x7f3b9c092990>

In [5]:
import random
import numpy as np

# Synthetic data: a quadratic with additive noise in [-0.2, 0.2).
x = np.arange(0, 1, 0.01)
y = 3*x*x - 2*x + 8
# Original used random.sample(np.arange(...), len(y)): modern Python rejects
# an ndarray population ("population must be a sequence"), and sampling
# without replacement from a fixed grid is not what's wanted anyway —
# independent uniform noise is.
y = y + np.random.uniform(-0.2, 0.2, len(y))
plt.plot(y, marker='o', linestyle='None')


Out[5]:
[<matplotlib.lines.Line2D at 0x7f3b8df68a10>]

In [6]:
# Keyword arguments: seaborn >= 0.12 no longer accepts positional x/y data
# (keyword form also works on older seaborn, so this is backward compatible).
ax = sns.regplot(x=x, y=y, ci=None, truncate=True)



In [7]:
# Quadratic fit (order=2). Keyword x/y: positional data args are
# deprecated/removed in seaborn >= 0.12; keywords work on old seaborn too.
ax = sns.regplot(x=x, y=y, order=2, ci=None, truncate=True)



In [8]:
# Deliberately over-fit with a degree-12 polynomial. Keyword x/y: positional
# data args are deprecated/removed in seaborn >= 0.12.
ax = sns.regplot(x=x, y=y, order=12, ci=None, truncate=True)



In [9]:
import tensorflow as tf
import pandas as pd
import seaborn as sns;

# Shuffle before splitting so the 80% training slice is not biased by the
# dataset's original ordering. random_state pins the shuffle so the
# train/validation split is reproducible across kernel restarts.
tips = sns.load_dataset("tips")
tips = tips.sample(frac=1.0, random_state=42)
trainsize = int(len(tips) * 0.8)
df_train = tips[:trainsize]
print(df_train[:5])


     total_bill   tip     sex smoker  day    time  size
168       10.59  1.61  Female    Yes  Sat  Dinner     2
7         26.88  3.12    Male     No  Sun  Dinner     4
182       45.35  3.50    Male    Yes  Sun  Dinner     3
6          8.77  2.00    Male     No  Sun  Dinner     2
167       31.71  4.50    Male     No  Sun  Dinner     4

In [10]:
# working with numpy arrays works
tf.logging.set_verbosity(tf.logging.INFO)
# Features: total_bill and party size as an (n, 2) float array.
predictors = df_train.loc[:,['total_bill', 'size']].values # np.ndarray
# Target: column index 1 of the frame, i.e. the 'tip' amount.
targets = df_train.iloc[:,1].values
# Let tf.contrib.learn infer real-valued feature columns from the array shape.
features = tf.contrib.learn.infer_real_valued_columns_from_input(predictors)
model = tf.contrib.learn.LinearRegressor(feature_columns=features)
# 1000 training steps; no model_dir given, so checkpoints go to a temp dir
# (see the WARNING in the cell output).
model.fit(predictors, targets, steps=1000)


WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmp7RNyVa
WARNING:tensorflow:Setting feature info to TensorSignature(dtype=tf.float32, shape=TensorShape([Dimension(None), Dimension(2)]), is_sparse=False)
WARNING:tensorflow:Setting targets info to TensorSignature(dtype=tf.float64, shape=TensorShape([Dimension(None)]), is_sparse=False)
INFO:tensorflow:Transforming feature_column _RealValuedColumn(column_name='', dimension=2, default_value=None, dtype=tf.float32)
INFO:tensorflow:Create CheckpointSaver
INFO:tensorflow:Step 1: loss = 9.86896
INFO:tensorflow:Step 101: loss = 0.828678
INFO:tensorflow:Step 201: loss = 0.826893
INFO:tensorflow:Saving checkpoints for 300 into /tmp/tmp7RNyVa/model.ckpt.
INFO:tensorflow:Step 301: loss = 0.806317
INFO:tensorflow:Step 401: loss = 0.788582
INFO:tensorflow:Step 501: loss = 0.776667
INFO:tensorflow:Saving checkpoints for 600 into /tmp/tmp7RNyVa/model.ckpt.
INFO:tensorflow:Step 601: loss = 0.769561
INFO:tensorflow:Step 701: loss = 0.765598
INFO:tensorflow:Step 801: loss = 0.763524
INFO:tensorflow:Saving checkpoints for 900 into /tmp/tmp7RNyVa/model.ckpt.
INFO:tensorflow:Step 901: loss = 0.762461
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmp7RNyVa/model.ckpt.
INFO:tensorflow:Loss for final step: 0.761937.
Out[10]:
LinearRegressor()

In [28]:
# Build 10 one-hot rows of length 10: each row is all zeros with a single
# 1.0 in a uniformly random column. Vectorized instead of the original
# list-comprehension + per-row shuffle; this also avoids the Py2 leak of
# the comprehension variable `x`, which clobbered the `x` array defined in
# an earlier cell.
arr = np.zeros((10, 10))
arr[np.arange(10), np.random.randint(0, 10, size=10)] = 1.0
print(arr)


[[ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]]

In [34]:
%bash
# NOTE(review): consider `%%bash` (standard cell magic) or `%pip install`
# so the install targets the kernel's environment, and pin the version
# (e.g. scikit-image==0.12.3) for reproducibility.
pip install scikit-image


Collecting scikit-image
  Downloading scikit_image-0.12.3-1-cp27-cp27mu-manylinux1_x86_64.whl (28.2MB)
Collecting dask[array]>=0.5.0 (from scikit-image)
  Downloading dask-0.11.0-py2.py3-none-any.whl (362kB)
Requirement already satisfied (use --upgrade to upgrade): six>=1.7.3 in /usr/local/lib/python2.7/dist-packages (from scikit-image)
Requirement already satisfied (use --upgrade to upgrade): scipy>=0.9.0 in /usr/local/lib/python2.7/dist-packages (from scikit-image)
Requirement already satisfied (use --upgrade to upgrade): pillow>=2.1.0 in /usr/local/lib/python2.7/dist-packages (from scikit-image)
Requirement already satisfied (use --upgrade to upgrade): matplotlib>=1.3.1 in /usr/local/lib/python2.7/dist-packages (from scikit-image)
Collecting networkx>=1.8 (from scikit-image)
  Downloading networkx-1.11-py2.py3-none-any.whl (1.3MB)
Requirement already satisfied (use --upgrade to upgrade): numpy; extra == "array" in /usr/local/lib/python2.7/dist-packages (from dask[array]>=0.5.0->scikit-image)
Collecting toolz>=0.7.2; extra == "array" (from dask[array]>=0.5.0->scikit-image)
  Downloading toolz-0.8.0.tar.gz (40kB)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.3.1->scikit-image)
Requirement already satisfied (use --upgrade to upgrade): pytz in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.3.1->scikit-image)
Requirement already satisfied (use --upgrade to upgrade): cycler in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.3.1->scikit-image)
Requirement already satisfied (use --upgrade to upgrade): pyparsing!=2.0.4,>=1.5.6 in /usr/local/lib/python2.7/dist-packages (from matplotlib>=1.3.1->scikit-image)
Requirement already satisfied (use --upgrade to upgrade): decorator>=3.4.0 in /usr/local/lib/python2.7/dist-packages (from networkx>=1.8->scikit-image)
Building wheels for collected packages: toolz
  Running setup.py bdist_wheel for toolz: started
  Running setup.py bdist_wheel for toolz: finished with status 'done'
  Stored in directory: /root/.cache/pip/wheels/b0/84/bf/7089262387e8ea60bdefb1fdb84d2ee99427f6d09c9c7ba37d
Successfully built toolz
Installing collected packages: toolz, dask, networkx, scikit-image
Successfully installed dask-0.11.0 networkx-1.11 scikit-image-0.12.3 toolz-0.8.0

In [40]:
import skimage
import skimage.io
import skimage.filters
import os
from skimage.color import rgb2gray

# Load the desert photo and collapse RGB to a single grayscale channel
# (rgb2gray returns a float image).
desert = rgb2gray(skimage.io.imread('algodones-dunes-1654439_1920.jpg'))
# NOTE(review): the plugin is switched *after* the imread above, so it can
# only affect later reads/display — confirm this ordering is deliberate.
skimage.io.use_plugin('matplotlib', 'imread')
skimage.io.imshow(desert)


Out[40]:
<matplotlib.image.AxesImage at 0x7f3b84778bd0>

In [59]:
# Horizontal Sobel filter responds to horizontal edges; the absolute value
# keeps edge magnitude regardless of gradient sign.
edges = np.abs(skimage.filters.sobel_h(desert))
from skimage.filters.rank import maximum
from skimage.morphology import disk
# Dilate edge responses with a radius-5 disk, then invert (255 - x) so
# strong edges render dark. NOTE(review): `edges` is a float image while
# rank filters expect uint8 — skimage will warn/convert; confirm the
# implied scaling is intended.
out = 255 - maximum(edges, disk(5))
skimage.io.imshow(out)


Out[59]:
<matplotlib.image.AxesImage at 0x7f3b5f2b7fd0>

In [60]:
# Smooth with a radius-2 mean filter first so the Sobel response picks up
# larger-scale edges rather than pixel noise.
smooth = skimage.filters.rank.mean(desert, disk(2))
edges = np.abs(skimage.filters.sobel_h(smooth))
# Dilate edge responses with a radius-5 disk, then invert so strong edges
# render dark. (`maximum` and `disk` are already imported by the previous
# cell; the duplicate imports here were redundant and have been dropped.)
out = 255 - maximum(edges, disk(5))
skimage.io.imshow(out)


Out[60]:
<matplotlib.image.AxesImage at 0x7f3b5f1c1c50>

## Benchmarking early models


In [1]:
import datalab.bigquery as bq
import numpy as np
import pandas as pd


def create_query(phase, EVERY_N):
  """Build the legacy-SQL BigQuery query for one dataset split.

  Args:
    phase: 1 = train, 2 = valid. For non-training splits, phase is also
      used as the hash bucket / sampling offset.
    EVERY_N: None to use the full dataset (train gets 2 of 4 hash buckets,
      valid gets bucket `phase`); otherwise sample roughly 1/EVERY_N of the
      rows (hash % EVERY_N == phase).

  Returns:
    The SQL query string.
  """
  base_query = """
SELECT
  DAYOFWEEK(pickup_datetime)*1.0 AS dayofweek,
  HOUR(pickup_datetime)*1.0 AS hourofday,
  pickup_longitude, pickup_latitude, 
  dropoff_longitude, dropoff_latitude,
  passenger_count*1.0 AS passenger_count,
  (tolls_amount + fare_amount) as fare_amount
FROM
  [nyc-tlc:yellow.trips]
WHERE
    trip_distance > 0
    AND fare_amount >= 2.5
    AND pickup_longitude > -78
    AND pickup_longitude < -70
    AND dropoff_longitude > -78
    AND dropoff_longitude < -70
    AND pickup_latitude > 37
    AND pickup_latitude < 45
    AND dropoff_latitude > 37
    AND dropoff_latitude < 45
    AND passenger_count > 0 
  """

  if EVERY_N is None:  # fix: compare to None with `is`, not `==`
    if phase < 2:
      # training: half (2 of 4) of the hash buckets
      query = "{0} AND ABS(HASH(pickup_datetime)) % 4 < 2".format(base_query)
    else:
      # validation: one dedicated hash bucket
      query = "{0} AND ABS(HASH(pickup_datetime)) % 4 == {1}".format(base_query, phase)
  else:
    # sampled split: roughly every EVERY_N-th row, offset by phase
    query = "{0} AND ABS(HASH(pickup_datetime)) % {1} == {2}".format(base_query, EVERY_N, phase)

  return query

def distance_between(lat1, lon1, lat2, lon2):
  """Great-circle distance (km) between two points, "as the crow flies".

  Uses the spherical law of cosines (the original comment said "haversine",
  which this is not). Taxis can't fly, of course — this is a lower bound on
  the trip distance.

  Args:
    lat1, lon1, lat2, lon2: coordinates in degrees (scalars or numpy arrays).

  Returns:
    Distance in kilometers. The 60 * 1.515 * 1.609344 degree-to-km factor is
    kept exactly as the original; downstream rate constants were tuned to it.
  """
  cos_angle = (np.sin(np.radians(lat1)) * np.sin(np.radians(lat2))
               + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2))
               * np.cos(np.radians(lon2 - lon1)))
  # Floating-point error can push the cosine slightly outside [-1, 1], which
  # makes arccos return NaN (e.g. identical pickup/dropoff points); clamp it.
  cos_angle = np.clip(cos_angle, -1.0, 1.0)
  dist = np.degrees(np.arccos(cos_angle)) * 60 * 1.515 * 1.609344
  return dist

def estimate_distance(df):
  """Estimated straight-line trip distance for each row of the taxi frame."""
  pickup_lat = df['pickup_latitude']
  pickup_lon = df['pickup_longitude']
  dropoff_lat = df['dropoff_latitude']
  dropoff_lon = df['dropoff_longitude']
  return distance_between(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

def compute_rmse(actual, predicted):
  """Root-mean-squared error between two aligned numeric arrays."""
  squared_error = np.square(actual - predicted)
  return np.sqrt(np.mean(squared_error))

def print_rmse(df, rate, name):
  """Print the RMSE of the simple rate-times-distance fare estimate on df."""
  predicted_fares = rate * estimate_distance(df)
  rmse = compute_rmse(df['fare_amount'], predicted_fares)
  print("{1} RMSE = {0}".format(rmse, name))

# Pull a small (hash % 100000 == 2) sample of the validation split from
# BigQuery and score the rate-times-distance baseline (rate = 2.56) on it.
query = create_query(2, 100000)
df_valid = bq.Query(query).to_dataframe()
print_rmse(df_valid, 2.56, 'Final Validation Set')


Final Validation Set RMSE = 8.02608564676

In [7]:
%%mlalpha train --cloud
# Submit a Cloud ML training job for the taxifare trainer package
# (BASIC scale tier, us-central1); args below are passed to trainer.task.
package_uris:  gs://cloud-training-demos-ml/taxifare/source4b/taxifare.tar.gz
python_module: trainer.task
scale_tier: BASIC
region: us-central1
args:
  train_data_paths: gs://cloud-training-demos-ml/taxifare/taxi_preproc4a/features_train*
  eval_data_paths: gs://cloud-training-demos-ml/taxifare/taxi_preproc4a/features_eval*
  metadata_path: gs://cloud-training-demos-ml/taxifare/taxi_preproc4a/metadata.yaml
  output_path: gs://cloud-training-demos-ml/taxifare/taxi_trained4b/eval
  max_steps: 2500
  hidden_layer1_size: 147
  number_buckets: 19
  learning_rate: 0.047
  batch_size: 512


Out[7]:

Job "trainer_task_161012_214213" was submitted successfully.
Run "%mlalpha jobs --name trainer_task_161012_214213" to view the status of the job.

Click here to view cloud log.
Start TensorBoard by running "%tensorboard start --logdir=<YourLogDir>".


In [9]:
# Poll the status of a previously submitted cloud training job by name.
%mlalpha jobs --name trainer_task_161012_212122


Out[9]:
createTime: '2016-10-12T21:21:23Z'
endTime: '2016-10-12T23:25:03Z'
jobId: trainer_task_161012_212122
startTime: '2016-10-12T21:21:33Z'
state: SUCCEEDED
trainingInput:
  args: [--number_buckets, '19', --metadata_path, 'gs://cloud-training-demos-ml/taxifare/taxi_preproc4a/metadata.yaml',
    --batch_size, '512', --eval_data_paths, 'gs://cloud-training-demos-ml/taxifare/taxi_preproc4a/features_eval*',
    --hidden_layer1_size, '147', --output_path, /tmp/temporary, --train_data_paths,
    'gs://cloud-training-demos-ml/taxifare/taxi_preproc4a/features_train*', --max_steps,
    '2500', --learning_rate, '0.047']
  packageUris: ['gs://cloud-training-demos-ml/taxifare/source4b/taxifare.tar.gz']
  pythonModule: trainer.task
  region: us-central1

In [4]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Summary chart: RMSE achieved by each lab's method.
# NOTE(review): 'Lab' has 6 entries but 'Method' and 'RMSE' have 5, so the
# last row ('4c') gets NaN Method/RMSE and is omitted from the bar plot —
# confirm whether a sixth method/RMSE pair is missing.
df = pd.DataFrame({'Lab' : pd.Series(['1a', '2c', '3a', '4a', '4b', '4c']),
              'Method' : pd.Series(['Heuristic', 'tf.learn', '+ Feature Eng', '+ Hyperparam', '+ 500m rows']),
              'RMSE': pd.Series([8.026, 10.344, 6.38, 6.28, 3.86]) })

ax = sns.barplot(data=df, x='Method', y='RMSE')
ax.set_ylabel('RMSE (dollars)')
ax.set_xlabel('CPB102 labs (methods)')
# Horizontal reference line at RMSE = 5 dollars across the axis.
plt.plot(np.linspace(-20,120,1000), [5]*1000, 'b')


Out[4]:
[<matplotlib.lines.Line2D at 0x7fc38d738190>]

In [ ]: