In [1]:
import math
import numpy as np
import pandas as pd
import re
from operator import itemgetter, attrgetter
In [30]:
def median(dataPoints):
    """Compute the median of the given data points.

    Args:
        dataPoints: a non-empty sequence of numbers.

    Returns:
        The middle value of the sorted data when the count is odd, or the
        mean of the two middle values (as a float) when the count is even.

    Raises:
        ValueError: if ``dataPoints`` is empty.
    """
    if not dataPoints:
        raise ValueError('no datapoints passed')
    sortedpoints = sorted(dataPoints)
    count = len(sortedpoints)
    mid = count // 2
    if count % 2 == 0:
        # Even count: there is no single middle element, so average the pair
        # that straddles the centre.
        return (sortedpoints[mid - 1] + sortedpoints[mid]) / 2.0
    return sortedpoints[mid]
def range(dataPoints):
    """Compute the range (largest value minus smallest value) of the data.

    NOTE(review): this definition rebinds the name ``range`` and therefore
    hides the built-in ``range`` from every cell executed after it. The name
    is kept so existing callers keep working; consider renaming it.

    Args:
        dataPoints: a non-empty sequence of numbers.

    Returns:
        ``max(dataPoints) - min(dataPoints)``.

    Raises:
        ValueError: if ``dataPoints`` is empty.
    """
    if not dataPoints:
        raise ValueError('no datapoints passed')
    return max(dataPoints) - min(dataPoints)
def quartiles(dataPoints):
    """Compute the lower and upper quartiles of the data.

    The sorted data is split at its middle; the lower quartile is the median
    of the lower half and the upper quartile is the median of the upper half.
    When the count is odd, the middle element belongs to neither half.

    Args:
        dataPoints: a sequence of at least two numbers.

    Returns:
        A ``(lowerQ, upperQ)`` tuple.

    Raises:
        ValueError: if ``dataPoints`` is empty or holds a single value (both
            halves would be empty, so no quartile can be computed).
    """
    if not dataPoints:
        raise ValueError('no datapoints passed')
    sortedpoints = sorted(dataPoints)
    count = len(sortedpoints)
    if count < 2:
        raise ValueError('at least two datapoints are needed to compute quartiles')
    mid = count // 2
    # For an odd count, skip the middle element when forming the upper half.
    upper_start = mid if count % 2 == 0 else mid + 1
    lowerQ = median(sortedpoints[:mid])
    upperQ = median(sortedpoints[upper_start:])
    return lowerQ, upperQ
def summary(dataPoints):
    """Print a five-number summary of the data to standard output.

    Prints the minimum, first quartile, median, third quartile and maximum,
    one per line, under a "Summary Statistics:" heading.

    Args:
        dataPoints: a non-empty sequence of numbers.

    Returns:
        An empty string, so that ``print(summary(data))`` only adds a blank
        line after the report.

    Raises:
        ValueError: if ``dataPoints`` is empty.
    """
    if not dataPoints:
        raise ValueError('no datapoints passed')
    # Compute the quartiles once and reuse both values.
    lowerQ, upperQ = quartiles(dataPoints)
    # Each line is formatted into a single string so the output is the same
    # text on Python 2 and Python 3 (no tuple repr).
    print("Summary Statistics:")
    print("Min : {0}".format(min(dataPoints)))
    print("First Quartile : {0}".format(lowerQ))
    print("median : {0}".format(median(dataPoints)))
    print("Third Quartile : {0}".format(upperQ))
    print("max : {0}".format(max(dataPoints)))
    return ""
In [31]:
# Sample data for the summary-statistics helpers.
datapoints = [68, 83, 58, 84, 100, 64]
# summary() prints the report itself and returns "", so the outer print only
# emits a trailing blank line. The call form works on Python 2 and 3.
print(summary(datapoints))
Question:
Write a program that calculates and prints the value according to the given formula:
Q = Square root of [(2 * C * D)/H]
Following are the fixed values of C and H:
C is 50. H is 30.
D is the variable whose values should be input to your program in a comma-separated sequence.
Example
Let us assume the following comma separated input sequence is given to the program:
100,150,180
The output of the program should be:
18,22,24
In [64]:
# Fixed values given by the exercise.
C = 50
H = 30


def f1(inputList, c=None, h=None):
    """Evaluate Q = sqrt((2 * c * D) / h) for every D in ``inputList``.

    Args:
        inputList: iterable of numeric D values.
        c: value of C in the formula; defaults to the module-level ``C``.
        h: value of H in the formula; defaults to the module-level ``H``.

    Returns:
        The results, each rounded to the nearest integer, joined into one
        comma-separated string. An empty input gives an empty string.
    """
    # Resolve defaults at call time so a later change to C or H is honoured.
    c = C if c is None else c
    h = H if h is None else h
    # 2.0 forces float division on Python 2 as well.
    answer = [math.sqrt((2.0 * c * num) / h) for num in inputList]
    return ','.join(str(int(round(num))) for num in answer)
# Parse the comma-separated D values and print the formatted results.
string = '100,150,180'
nums = [int(num) for num in string.split(',')]
print(f1(nums))
Question:
Write a program which takes 2 digits, X,Y as input and generates a 2-dimensional array. The element value in the i-th row and j-th column of the array should be i*j.
Note: i = 0, 1, ..., X-1; j = 0, 1, ..., Y-1.
Example
Suppose the following inputs are given to the program:
3,5
Then, the output of the program should be:
[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4], [0, 2, 4, 6, 8]]
In [65]:
dimensions = [3, 5]
rows, columns = dimensions
# Element (i, j) must equal i * j, which is the outer product of the row
# indices with the column indices. np.arange is used instead of range()
# because an earlier cell of this notebook rebinds the name `range` to a
# statistics helper. The result keeps an integer dtype.
array = np.outer(np.arange(rows), np.arange(columns))
# Print as a nested list, the format the exercise asks for.
print(array.tolist())
Question:
Write a program that accepts a comma separated sequence of words as input and prints the words in a comma-separated sequence after sorting them alphabetically.
Suppose the following input is supplied to the program:
without,hello,bag,world
Then, the output should be:
bag,hello,without,world
In [66]:
# Split the comma-separated words, sort them alphabetically and print them
# back as a comma-separated sequence.
string = 'without,hello,bag,world'
wordList = sorted(string.split(','))
print(','.join(wordList))
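Question:
A website requires users to enter a username and password to register. Write a program to check the validity of the passwords entered by users. A password is valid only if it meets all of the following criteria:
1. At least one lowercase letter in [a-z]
2. At least one digit in [0-9]
3. At least one uppercase letter in [A-Z]
4. At least one character from [$#@]
5. Minimum length: 6 characters
6. Maximum length: 12 characters
7. No whitespace characters
The program takes a comma-separated sequence of passwords and prints, comma-separated, the ones that meet all the criteria.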
In [67]:
def check_password(items):
    """Return the valid passwords from ``items`` as a comma-separated string.

    A password is valid when all of the following hold:
      * its length is between 6 and 12 characters (inclusive);
      * it contains at least one lowercase letter, one digit, one uppercase
        letter and one of the characters ``$``, ``#``, ``@``;
      * it contains no whitespace.

    Args:
        items: iterable of candidate password strings.

    Returns:
        The valid passwords, in input order, joined with commas (an empty
        string when none are valid).
    """
    required_patterns = (r'[a-z]', r'[0-9]', r'[A-Z]', r'[$#@]')
    values = []
    for candidate in items:
        # Reject when EITHER bound is violated; a length can never be both
        # below 6 and above 12 at the same time.
        if len(candidate) < 6 or len(candidate) > 12:
            continue
        if re.search(r'\s', candidate):
            continue
        if all(re.search(pattern, candidate) for pattern in required_patterns):
            values.append(candidate)
    return ','.join(values)
In [68]:
# Candidate passwords, comma-separated; print the ones that pass validation.
string = 'ABd1234@1,a F1#,2w3E*,2We3345 '
items = string.split(',')
print(check_password(items))
Question:
You are required to write a program to sort (name, age, score) tuples in ascending order, where name is a string and age and score are numbers. The tuples are input from the console. The sort criteria are:
1: Sort based on name;
2: Then sort based on age;
3: Then sort by score.
The priority is that name > age > score.
If the following tuples are given as input to the program:
Tom,19,80
John,20,90
Jony,17,91
Jony,17,93
Json,21,85
Then, the output of the program should be:
[('John', '20', '90'), ('Jony', '17', '91'), ('Jony', '17', '93'), ('Json', '21', '85'), ('Tom', '19', '80')]
In [69]:
string = 'Tom,19,80 John,20,90 Jony,17,91 Jony,17,93 Json,21,85'
# One (name, age, score) tuple of strings per whitespace-separated record.
items = [tuple(item.split(',')) for item in string.split()]
# Sort by name, then age, then score. Age and score are compared as integers:
# comparing them as strings would order '100' before '20'. The tuples
# themselves keep their string fields.
items_sorted = sorted(
    items,
    key=lambda record: (record[0], int(record[1]), int(record[2])),
)
print(items_sorted)
Question:
Write a program to compute the frequency of the words in the input. The output should be printed after sorting the keys alphanumerically.
Suppose the following input is supplied to the program:
New to Python or choosing between Python 2 and Python 3? Read Python 2 or Python 3.
Then, the output should be:
2:2
3.:1
3?:1
New:1
Python:5
Read:1
and:1
between:1
choosing:1
or:2
to:1
In [70]:
string = 'New to Python or choosing between Python 2 and Python 3? Read Python 2 or Python 3.'
# Count how often each whitespace-separated token occurs. split() without an
# argument never yields empty tokens, even when spaces are repeated.
freq = {}
for word in string.split():
    freq[word] = freq.get(word, 0) + 1
# Report the tokens in sorted order, one "token:count" per line.
words = sorted(freq)
for item in words:
    print("%s:%d" % (item, freq[item]))
In [ ]:
Some exercises on using pandas for DataFrame operations.
The source of these exercises is: https://github.com/ajcr/100-pandas-puzzles/blob/master/100-pandas-puzzles-with-solutions.ipynb
In [73]:
# Raw columns for the pandas exercises; missing ages are recorded as NaN.
animal_column = ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog']
age_column = [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3]
visits_column = [1, 3, 2, 3, 2, 3, 1, 1, 2, 1]
priority_column = ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']

data = {
    'animal': animal_column,
    'age': age_column,
    'visits': visits_column,
    'priority': priority_column,
}

# One row label per record.
labels = list('abcdefghij')
In [75]:
# Build the DataFrame from the `data` dictionary, then label its rows with
# `labels`.
df = pd.DataFrame(data)
df.index = labels

# Basic information about the frame, followed by descriptive statistics
# (the last expression is what the cell displays).
df.info()
df.describe()
Out[75]:
In [85]:
# Show the first 3 and the last 3 rows of the DataFrame, separated by a
# spacer line. The call form of print works on Python 2 and 3.
print(df.head(3))
print(' ')
print(df.tail(3))
In [89]:
# Keep every row, but only the 'animal' and 'age' columns.
df.loc[:, ['animal', 'age']]
Out[89]:
In [90]:
# Rows at positions 3, 4 and 8, restricted to the 'animal' and 'age' columns.
# Positions are translated to index labels first, because .loc selects by
# label.
wanted_rows = df.index[[3, 4, 8]]
wanted_columns = ['animal', 'age']
df.loc[wanted_rows, wanted_columns]
Out[90]:
In [91]:
# Rows whose number of visits is strictly greater than 3.
more_than_three_visits = df['visits'].gt(3)
df[more_than_three_visits]
Out[91]:
In [92]:
# Rows whose age is missing (NaN).
age_is_missing = df['age'].isnull()
df[age_is_missing]
Out[92]:
In [95]:
# Rows where the animal is a cat AND its age is below 3.
is_cat = df['animal'].eq('cat')
is_under_three = df['age'].lt(3)
df[is_cat & is_under_three]
Out[95]:
In [97]:
# Rows where the age lies between 2 and 4, both ends included.
at_least_two = df['age'].ge(2)
at_most_four = df['age'].le(4)
df[at_least_two & at_most_four]
Out[97]:
In [98]:
# Set the age in row 'f' to 1.5 (single-cell assignment by label).
df.at['f', 'age'] = 1.5
In [100]:
# Total number of visits across all rows.
visits = df.loc[:, 'visits']
visits.sum()
Out[100]:
In [102]:
# Mean age per animal type: group the 'age' column by the 'animal' column.
df['age'].groupby(df['animal']).mean()
Out[102]:
In [104]:
# Append a new row 'k' to df, then delete it again to get back the original
# rows.
# The values are keyed by column name and laid out in df's own column order,
# so each one lands in the right column whatever that order is.
new_row = {'animal': 'dog', 'age': 5.5, 'visits': 2, 'priority': 'no'}
df.loc['k'] = [new_row[column] for column in df.columns]
# ...and then delete the new row.
df = df.drop('k')
In [106]:
# Number of rows for each type of animal in df.
animal_column = df.loc[:, 'animal']
animal_column.value_counts()
Out[106]:
In [109]:
# Sort df by 'age' in descending order first, then by 'visits' in ascending
# order. The i-th flag applies to the i-th sort key.
sort_keys = ['age', 'visits']
ascending_flags = [False, True]
df.sort_values(by=sort_keys, ascending=ascending_flags)
Out[109]:
In [114]:
# The 'priority' column holds the strings 'yes' and 'no'; replace it with a
# column where 'yes' becomes True and 'no' becomes False.
yes_no_to_bool = {'yes': True, 'no': False}
df['priority'] = df['priority'].map(yes_no_to_bool)
In [115]:
# Rename every 'snake' entry in the 'animal' column to 'python'.
df['animal'] = df['animal'].replace('snake', 'python')
In [116]:
# For each animal type and each number of visits, find the mean age.
#In other words, each row is an animal, each column is a number of visits and the values are the mean ages
#(hint: use a pivot table).
In [120]:
# One row per animal, one column per number of visits, mean age in each cell.
pd.pivot_table(df, values='age', index='animal', columns='visits', aggfunc='mean')
Out[120]:
In [122]:
# A DataFrame df with a single column 'A' of integers, for example:
column_a = pd.Series([1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7], name='A')
df = column_a.to_frame()
# How do you filter out the rows that contain the same integer as the row
# immediately above?
In [124]:
# Keep a row only when its value differs from the value in the row above.
value_above = df['A'].shift()
df[df['A'].ne(value_above)]
Out[124]:
In [125]:
# Given a DataFrame of numeric values, say a 5x3 frame of random floats
# drawn from [0, 1):
random_values = np.random.rand(5, 3)
df = pd.DataFrame(random_values)
# how do you subtract the row mean from each element in the row?
In [135]:
# Mean of each row (aggregating across the columns), then subtract it from
# every element of that row by aligning the means on the row index.
row_means = df.mean(axis='columns')
df.subtract(row_means, axis='index')
Out[135]:
In [136]:
# Suppose you have a DataFrame with 10 columns of real numbers, for example:
column_labels = list('abcdefghij')
df = pd.DataFrame(np.random.rand(5, 10), columns=column_labels)
# Which column of numbers has the smallest sum? (Find that column's label.)
In [141]:
# Total of each column, then the label of the column with the smallest total.
column_totals = df.sum(axis='index')
column_totals.idxmin()
Out[141]:
In [144]:
# How do you count how many unique rows a DataFrame has (i.e. ignore all rows
# that are duplicates)?
# duplicated(keep=False) flags every member of a duplicated group, so
# subtracting the number of flagged rows leaves the rows that occur once.
unique_row_count = len(df) - df.duplicated(keep=False).sum()
# Simpler equivalent: drop every duplicated row and count what remains.
assert unique_row_count == len(df.drop_duplicates(keep=False))
print(unique_row_count)
In [145]:
#You have a DataFrame that consists of 10 columns of floating--point numbers.
#Suppose that exactly 5 entries in each row are NaN values.
#For each row of the DataFrame, find the column which contains the third NaN value.
#(You should return a Series of column labels.)
In [153]:
# Running count of NaNs along each row; the first column where that count
# equals 3 is the column holding the row's third NaN.
nan_running_count = df.isnull().cumsum(axis='columns')
(nan_running_count == 3).idxmax(axis='columns')
Out[153]:
In [159]:
# A DataFrame has a column of groups 'grps' and a column of numbers 'vals'.
# For example:
group_labels = list('aaabbcaabcccbbc')
group_values = [12, 345, 3, 1, 45, 14, 4, 52, 54, 23, 235, 21, 57, 3, 87]
df = pd.DataFrame({'grps': group_labels, 'vals': group_values})
# For each group, find the sum of the three greatest values.
In [168]:
# Take the three largest 'vals' within each group, then total them per group.
# Grouping on index level 0 (the group label) replaces the `level=` argument
# of sum(), which recent pandas releases no longer accept.
df.groupby('grps')['vals'].nlargest(3).groupby(level=0).sum()
Out[168]:
In [169]:
#A DataFrame has two integer columns 'A' and 'B'. The values in 'A' are between 1 and 100 (inclusive).
#For each group of 10 consecutive integers in 'A' (i.e. (0, 10], (10, 20], ...),
#calculate the sum of the corresponding values in column 'B'.
In [ ]:
In [171]:
# 1. Print the version of NumPy available to this kernel.
numpy_version = np.__version__
print(numpy_version)
In [172]:
# 2. Convert a list of numeric values into a one-dimensional NumPy array.
l = [12.23, 13.32, 100, 36.32]
# Each line is formatted into a single string so the output is the same text
# on Python 2 and Python 3.
print('original list: %s' % (l,))
print('numpy array : %s' % (np.array(l),))
In [175]:
# Create a 3x3 matrix holding the integers 2 through 10 in row-major order.
np.reshape(np.arange(2, 11), (3, 3))
Out[175]:
In [ ]: