In [1]:
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
In [ ]:
In [ ]:
In [17]:
# Load the APT sample set: entropy/file-size features plus the label table.
train_data = pd.read_csv('data/sorted-entropy-features-apt.csv')
train_labels = pd.read_csv('data/sorted-train-labels-apt.csv')
# Drop column 0 (presumably a sample id — TODO confirm against the CSV) and
# keep the remaining columns as features.
X = train_data.iloc[:,1:]
# Column 4 of the label file is the target; later cells plot it as "Family Label".
y = train_labels.iloc[:,4]
In [4]:
# Peek at the label table to check its column layout.
train_labels.head()
Out[4]:
In [18]:
# First few target values (family labels) for the APT set.
y.head()
Out[18]:
In [19]:
# Scatter of file entropy (x) vs. file size (y), coloured by family label.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
xa = np.array(X['entropy'])
xb = np.array(X['file_size'])
ya = np.array(y)
# The original also built `color = [item/255 for item in ya]` but never used it
# (and under Python 2 integer division it would have been all zeros anyway);
# passing `c=ya` with a colormap is the correct way to colour by label.
plt.scatter(xa, xb, c=ya, cmap='brg')
Out[19]:
In [20]:
# VS251 sample set: load the feature and label tables, then preview the target.
train_data = pd.read_csv('data/sorted-entropy-features-vs251.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv')
# Features are everything except column 0; the target is label column 4.
X, y = train_data.iloc[:, 1:], train_labels.iloc[:, 4]
y.head()
Out[20]:
In [21]:
# Entropy vs. file size for the VS251 set, coloured by family label.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
entropy_vals = X['entropy'].values
size_vals = X['file_size'].values
family = y.values
plt.scatter(entropy_vals, size_vals, c=family, cmap='brg')
Out[21]:
In [22]:
# VS252 sample set: same layout as the previous sets.
train_data = pd.read_csv('data/sorted-entropy-features-vs252.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs252.csv')
# Features are everything except column 0; the target is label column 4.
X, y = train_data.iloc[:, 1:], train_labels.iloc[:, 4]
y.head()
Out[22]:
In [23]:
# Entropy vs. file size for the VS252 set, coloured by family label.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
ent = X['entropy'].values
fsize = X['file_size'].values
labels = y.values
plt.scatter(ent, fsize, c=labels, cmap='brg')
Out[23]:
In [24]:
# VS263 sample set: features (minus the id column) and the family-label target.
train_data = pd.read_csv('data/sorted-entropy-features-vs263.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs263.csv')
X = train_data.iloc[:, 1:]
y = train_labels.iloc[:, 4]
# Quick sanity check of the first target values.
y.head()
Out[24]:
In [25]:
# Entropy vs. file size for the VS263 set, coloured by family label.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
feature_x = X['entropy'].values
feature_y = X['file_size'].values
plt.scatter(feature_x, feature_y, c=y.values, cmap='brg')
Out[25]:
In [26]:
# VS264 sample set: features (minus the id column) and the family-label target.
train_data = pd.read_csv('data/sorted-entropy-features-vs264.csv')
train_labels = pd.read_csv('data/sorted-train-labels-vs264.csv')
X, y = train_data.iloc[:, 1:], train_labels.iloc[:, 4]
y.head()
Out[26]:
In [27]:
# Entropy vs. file size for the VS264 set, coloured by family label.
plt.figure(figsize=(15, 15))
plt.xlabel("Entropy")
plt.ylabel("File Size")
entropy_arr = X['entropy'].values
filesize_arr = X['file_size'].values
family_arr = y.values
plt.scatter(entropy_arr, filesize_arr, c=family_arr, cmap='brg')
Out[27]:
In [29]:
# Per-family sample counts (index = family label, value = count).
vcounts = y.value_counts()
In [33]:
# Family label frequencies: label id on x, sample count on y.
plt.figure(figsize=(15, 15))
plt.xlabel("Family Label")
plt.ylabel("Label Count")
label_ids = vcounts.index
plt.scatter(label_ids, vcounts, c=label_ids, cmap='brg')
Out[33]:
In [ ]:
In [ ]:
In [ ]:
In [1]:
# Python 2 only: urllib2 and the BeautifulSoup 3 package (replaced by
# urllib.request / bs4 on Python 3).
import urllib2
from BeautifulSoup import BeautifulSoup
# Fetch a page and parse it; BeautifulSoup(page) reads the response to EOF,
# so `page` is consumed after this cell.
page = urllib2.urlopen('http://www.google.com/')
soup = BeautifulSoup(page)
In [13]:
# Re-point `page`/`soup` at a different site (overwrites the google.com fetch).
page = urllib2.urlopen('http://www.prehnite.com.au/')
soup = BeautifulSoup(page)
# Collect all <img> tags from the document body.
x = soup.body.findAll('img')
In [17]:
# Re-runs the findAll from the previous cell (duplicate; result unchanged).
x = soup.body.findAll('img')
In [20]:
# Echo every response line containing an <img> tag.
# NOTE(review): loop-body indentation was lost in the notebook export and is
# restored here. Also, `page` was already read to EOF by BeautifulSoup(page)
# in the earlier cell, so iterating it again likely yields nothing —
# re-open the URL before running this.
for line in page:
    if "<img" in line:
        print("img tag -> {:s}".format(line))
In [24]:
# Read the remaining response body — presumably empty, since the response was
# already consumed by BeautifulSoup/iteration above; TODO confirm.
htmllines = page.read()
In [43]:
import urllib2
import re
response = urllib2.urlopen('http://adskeeper.co.uk/ad10300.html?site_id=1187204')
#print "Response:", response
# Get the URL. This gets the real URL.
#print "The URL is: ", response.geturl()
# Getting the code
#print "This gets the code: ", response.code
# Get the Headers.
# This returns a dictionary-like object that describes the page fetched,
# particularly the headers sent by the server
#print "The Headers are: ", response.info()
# Get the date part of the header
#print "The Date is: ", response.info()['date']
# Get the server part of the header
#print "The Server is: ", response.info()['server']
# Get all data
html = response.read()
#print "Get all data: ", html
lines = html.split('\n')
# Get only the length
#print "Get the length :", len(html)
purl = re.compile('.+src="(.+\.jpg)".*')
pjpg = re.compile('.+/(\w+\.jpg)".*')
# Showing that the file object is iterable
for line in lines:
if "img" in line:
print line.rstrip()
jpgurl = purl.match(line)
jpgname = pjpg.match(line)
if jpgurl != None:
image_url = jpgurl.group(1)
print image_url
if jpgname != None:
image_name = jpgname.group(1)
print image_name
jpgresponse = urllib2.urlopen(image_url)
jpgdata = jpgresponse.read()
of = open('data/'+image_name, 'wb')
of.write(jpgdata)
of.close()
In [42]:
purl = re.compile('.+src="(.+\.jpg)".*')
pjpg = re.compile('.+/(\w+\.jpg)".*')
# Showing that the file object is iterable
for line in lines:
if "img" in line:
print line.rstrip()
jpgurl = purl.match(line)
jpgname = pjpg.match(line)
if jpgurl != None:
image_url = jpgurl.group(1)
print image_url
if jpgname != None:
image_name = jpgname.group(1)
print image_name
jpgresponse = urllib2.urlopen("http://www.prehnite.com.au/"+image_url)
jpgdata = jpgresponse.read()
of = open('data/'+image_name, 'wb')
of.write(jpgdata)
of.close()
In [ ]:
In [ ]:
# Dump the raw HTML fetched above (Python 2 print statement).
print html
In [ ]:
# Interactive help on the parsed BeautifulSoup object.
help(soup)