In [5]:
import pandas as pd
import numpy as np
In [6]:
series1 = pd.Series(['a', 'b', 'c'])
print(series1[1:3])
In [12]:
# Viewing Specific Keys
record = pd.Series({
'firstname': 'Charles',
'lastname': 'Givre',
'middle': 'classified'
})
record[['firstname', 'lastname']]
# firstname Charles
# lastname Givre
# String manipulations: Show True/False for each record
record.str.contains('Cha')
# firstname True
# lastname False
# middle False
# Filter based on the returned True/False
record[record.str.contains('Cha')]
# Other string manipulation functions available via record.str:
# => contains, count, extract, find, findall, len
Out[12]:
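In [ ]:
# Sketch (not in the original notes): a few of the other .str methods,
# applied to the same `record` series defined above.
record.str.len()                # length of each string
record.str.count('e')           # how many times 'e' appears in each value
record.str.findall('[aeiou]')   # list of the vowels found in each value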
In [8]:
# Display head/tail number of records
randomNumbers = pd.Series(
np.random.randint(1, 100, 50)
)
randomNumbers.head(10)
# randomNumbers.tail(7)
Out[8]:
In [9]:
# Filtering Data in a Series
# 1. Generate True/False values for all values
randomNumbers < 10
# 2. Filter based on True/False values and only show records
randomNumbers[randomNumbers < 10]
Out[9]:
In [17]:
small_rands = randomNumbers[randomNumbers < 10]
small_rands
Out[17]:
In [18]:
def addTwo(n):
    return n + 2
In [19]:
# iterate through the entire series and apply the addTwo function to each value
small_rands.apply(addTwo)
Out[19]:
In [20]:
# Use lambda instead
small_rands.apply(lambda x: x + 1)
Out[20]:
In [32]:
# Remove missing values:
Series.dropna()
# Replace missing values with a given value:
Series.fillna(value="<something>")
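In [ ]:
# Sketch (not in the original notes): dropna/fillna on a small series
# containing a missing value.
s = pd.Series([1.0, np.nan, 3.0])
s.dropna()          # drops the NaN entry, keeping 1.0 and 3.0
s.fillna(value=0)   # replaces the NaN with 0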
In [ ]:
data = pd.DataFrame(<data>, <index>, <column_names>)
# 1. Pass two-dimensional data (e.g. a dict of Series, a 2D array, or a list of lists)
# 2. Usually, read from outside sources:
data = pd.read_csv(<file>)
data = pd.read_excel('file.xls')
data = pd.read_json(<file>/<url>)
data = pd.read_sql(<query>, <connection_obj>)
data = pd.read_html(<source>)
logdf = pd.read_table('../data/mysql.log', names=['raw'])
# refer to a column:
logdf['raw'].str.extract(
'(?P<data>\d{6}\s\d{2}:\d{2}:\d{2}:\d{2})...',
expand=False
)
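In [ ]:
# Sketch (not in the original notes): building a DataFrame directly from
# two-dimensional data (here a dict of columns) with an explicit index;
# the column names and values are made up for illustration.
example_df = pd.DataFrame(
    {'ip': ['10.0.0.1', '10.0.0.2'], 'hits': [42, 7]},
    index=['row1', 'row2']
)
example_df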
In [ ]:
# Web Server Logs
# 1. Complicated to parse - use apache_log_parser package
import apache_log_parser
line_parser = apache_log_parser.make_parser("%h %l %u %t \'%r\' %>s %b \'%{Referer}i\' \'%{User-agent}i\' ")
# pandas is moving toward an Arrow-backed data structure
server_log = open("../data/hackers-access.httpd", "r")
parsed_server_data = []
# loop over the file, add each parsed line to parsed_server_data, and move it into a DataFrame (sketched in the next cell)
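In [ ]:
# Sketch (not in the original notes) of the loop described above: parse each
# raw log line into a dict and collect the dicts into a DataFrame.
# `server_df` is a made-up name; assumes the file and parser from the cell above.
for line in server_log:
    if line.strip():
        parsed_server_data.append(line_parser(line))
server_log.close()
server_df = pd.DataFrame(parsed_server_data)
server_df.head()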
In [ ]:
# Manipulating a DataFrame
df = data['column'] # returns a Series
df['ip'].value_counts().head() # counts occurrences of each unique IP
# return the following columns
df = data[['column1', 'column2', 'column3']] # returns a DataFrame
# extract columns and filter
# 1. show the specified columns
# 2. filter on any column
df[['col1', 'col2']][df['col3'] > 5]
# pull out individual rows
data.loc[<index>]
data.loc[<list of indexes>]
data.sample(<n>) # return a random sample of the dataset
# Apply to DataFrame
data.apply(<function>, axis=1)
# - with axis=1 the function receives each row as a Series
# - the function returns a new value per row => allows us to add new columns
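In [ ]:
# Sketch (not in the original notes): row-wise apply on a tiny made-up frame;
# with axis=1 the function sees each row as a Series and its return value
# becomes a new column.
tiny = pd.DataFrame({'col1': [1, 2], 'col2': [10, 20]})
tiny['total'] = tiny.apply(lambda row: row['col1'] + row['col2'], axis=1)
tiny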
In [ ]:
# Apply a function to a column and then create a new column
df = pd.read_csv('data/dailybots.csv')
# df['orgs'] + 2 => will add 2 to the entire column
df['orgs2'] = df['orgs'] + 2 # this will create a new column
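In [ ]:
# Sketch (not in the original notes): the same idea using .apply on a single
# column; 'orgs_doubled' is a made-up name, 'orgs' and 'orgs2' come from the cell above.
df['orgs_doubled'] = df['orgs'].apply(lambda x: x * 2)
df[['orgs', 'orgs2', 'orgs_doubled']].head()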
In [ ]:
# Transpose
data.T  # transpose => reshape the data
# aggregation
# 1. sum down each column (axis=0)
data.sum(axis=0)
# 2. sum across each row (axis=1)
data.sum(axis=1)
# drop
# 1. inplace => modify the current DataFrame instead of returning a copy
# 2. errors => 'raise' (fail if a label is missing) or 'ignore'
data.drop(labels, axis=0, level=None, inplace=False, errors='raise')
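In [ ]:
# Sketch (not in the original notes): dropping a column by label; axis=1 targets
# columns, inplace=False returns a new frame, and errors='raise' fails if the
# label is missing. Uses the 'orgs2' column created earlier.
df_smaller = df.drop('orgs2', axis=1, inplace=False, errors='raise')
df_smaller.head()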
In [ ]:
# Merging datasets
# Union: stack series1 and series2 into one series
combinedSeries = pd.concat([series1, series2], ...)
# Join
# 1. Inner Join -> only the rows common to both sets
# 2. Outer Join -> all rows from both sets
# 3. Left Join -> all rows from set A plus their matches in B; rows only in B are dropped
# 4. Right Join -> the reverse: all rows from set B plus their matches in A
pd.merge(leftData, rightData,
         how="<join type: inner, left, right, outer>",
         on="<field or list of fields>")
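In [ ]:
# Sketch (not in the original notes): an inner join on a shared 'ip' column;
# both frames and their values are made up for illustration.
left = pd.DataFrame({'ip': ['10.0.0.1', '10.0.0.2'], 'hits': [42, 7]})
right = pd.DataFrame({'ip': ['10.0.0.2', '10.0.0.3'], 'country': ['US', 'DE']})
pd.merge(left, right, how='inner', on='ip')   # only 10.0.0.2 is in both frames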
In [ ]:
# Grouping and Aggregating data
df_grouped = df.groupby(
['Protocol', 'Source', 'Destination']
)
print(df_grouped.size())
stats_packets = df_grouped['Length'].agg({
    'No Packets': len,
    'Volume': sum,
    'SD': lambda x: np.std(x, ddof=1)
})
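In [ ]:
# Note (not in the original notes): newer pandas versions no longer accept the
# dict-renaming form of .agg used above; a sketch of the equivalent named
# aggregation, assuming the same df_grouped object:
stats_packets = df_grouped['Length'].agg(
    no_packets='size',
    volume='sum',
    sd=lambda x: np.std(x, ddof=1)
)
stats_packets.head()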