Using pandas on various datasets for statistical operations.
Using various Python visualization libraries to improve our understanding of the data.
In [71]:
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import os
import time

# SECURITY FIX: the consumer key/secret and access token/secret were committed
# here in plain text.  Those values must be revoked/rotated on
# https://dev.twitter.com/apps and supplied via environment variables instead.
ckey = os.environ.get('TWITTER_CONSUMER_KEY', '')
csecret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
# The access tokens can be found on your application's Details
# page located at https://dev.twitter.com/apps (located
# under "Your access token")
atoken = os.environ.get('TWITTER_ACCESS_TOKEN', '')
asecret = os.environ.get('TWITTER_ACCESS_SECRET', '')

# Tweepy
class listener(StreamListener):
    """Minimal stream listener: print each raw tweet payload and collect it."""

    def on_data(self, data):
        print(data)
        # FIX: Stream.filter() returns None, so the original
        # ls.append(twitterStream.filter(...)) only ever collected None;
        # collect the raw JSON payloads here instead.
        ls.append(data)
        return True  # keep the stream connection open

    def on_error(self, status):
        # Print the HTTP status code on error (e.g. 420 = rate limited).
        print(status)

auth = OAuthHandler(ckey, csecret)
auth.set_access_token(atoken, asecret)

RUN_SECONDS = 0.009  # time budget kept from the original; raise it to capture more tweets
start_time = time.time()
ls = []
while True:
    twitterStream = Stream(auth, listener())
    twitterStream.filter(track=["car"])  # blocks until the stream disconnects
    # FIX: the original condition was `< 0.009`, i.e. it broke out only when
    # almost no time had elapsed; the loop should stop once the budget is spent.
    if (time.time() - start_time) >= RUN_SECONDS:
        break
In [2]:
import nltk
from nltk.probability import ELEProbDist, FreqDist, DictionaryProbDist
from nltk import NaiveBayesClassifier
from nltk import FreqDist, ConditionalFreqDist
from nltk import BigramAssocMeasures
from collections import defaultdict
In [5]:
# Hand-labelled training data: (raw tweet text, sentiment label) pairs.
# NOTE(review): 'positive' appears to mean pro-meat-ban / pro-vegetarian
# sentiment and 'negative' anti-ban -- TODO confirm the labelling convention.
pos_tweets = [('Attention all Nature Lovers - Cattle Cruelty in India & Rescues', 'positive'),
              ('Are you having debate on Poojari Lynching for try to save cow slaughter?', 'positive'),
              ('Hope People start loving all animals like this & not show', 'positive'),
              ('if slaughter houses had glass walls everyone would be a vegetarian', 'positive'),
              ('BanBeef SayNOtoMEATexport PlunderOfIndia SaveGauVansh Cow Vegetarian JagoBharathJag', 'positive'),
              ('I will eat beef and pork for religious reasons, occasionally. I will be vegetarian for ethical reasons, frequently. #meatban #vegetarian','positive')
              ]
neg_tweets = [('Let try to ban hunger before we ban meat?', 'negative'),
              ('meatban causing price of pulses at 200+/kg', 'negative'),
              ('Where is Indian Politics heading to? Chicken, mutton or beef now parliament will approve the dinner', 'negative'),
              ('There is something truly secretly delicious about having your mouth enjoy a perfect BLT in a country with meat ban', 'negative'),
              ('We will let loose 100 pigs in Jama Masjid if the meatban was not enforced on 9 days of Navratri', 'negative')]
# Single held-out example used to sanity-check feature extraction below.
test_tweet = [('A Question: Can someone please tell me how jhatka came into being in answer to halal in India? Please enlighten','negative')]
def _tokenize(text):
    """Lowercase a tweet and keep only tokens of length >= 3 (drops 'a', 'is', 'RT', ...)."""
    return [e.lower() for e in text.split() if len(e) >= 3]

# Tokenize the labelled training tweets: (raw_text, label) -> (tokens, label).
# (The original repeated the same filtering loop for both lists; a single
# helper removes the duplication.)
tweets = [(_tokenize(words), sentiment) for (words, sentiment) in pos_tweets + neg_tweets]
#print(tweets)

# Same preprocessing for the held-out test tweet(s).
test_tweets = [(_tokenize(words), sentiment) for (words, sentiment) in test_tweet]
#print(test_tweets)
def get_words_in_tweets(tweets):
    """Flatten a list of (tokens, sentiment) pairs into a single token list."""
    return [word for (words, _sentiment) in tweets for word in words]
def get_word_features(wordlist):
    """Return the vocabulary (unique words) of `wordlist` as a keys view.

    The original built an nltk.FreqDist only to throw the counts away and keep
    `.keys()`; an order-preserving dedup yields the same vocabulary without the
    nltk call.  NOTE(review): on nltk 3 FreqDist.keys() iterates in first-seen
    (Counter) order, which this matches; very old nltk sorted keys by
    frequency, which only affected display order of the feature names.
    """
    return dict.fromkeys(wordlist).keys()
# Global vocabulary consumed by extract_features() below.
word_features = get_word_features(get_words_in_tweets(tweets))
#print(word_features)
def extract_features(document, vocabulary=None):
    """Map a token list to NLTK's boolean 'contains(word)' feature dict.

    Parameters
    ----------
    document : iterable of str
        Tokens of one tweet.
    vocabulary : iterable of str, optional
        Feature words to test for.  Defaults to the module-level
        `word_features` (backward compatible with the original, which read
        the global directly).
    """
    if vocabulary is None:
        vocabulary = word_features
    document_words = set(document)  # set for O(1) membership tests
    return {'contains(%s)' % word: (word in document_words) for word in vocabulary}
# BUG FIX: the original called extract_features(test_tweet), i.e. on a list of
# (raw_text, label) tuples rather than on tokens, so every 'contains(...)'
# feature came out False.  Use the tokenized words of the test tweet.
a = extract_features(test_tweets[0][0])
print(a)
# Lazily apply the feature extractor over the labelled training tokens.
training_set = nltk.classify.apply_features(extract_features, tweets)
print(training_set)
classifier = nltk.NaiveBayesClassifier.train(training_set)
def train(labeled_featuresets, estimator= ELEProbDist):
    """Sketch of NLTK's NaiveBayesClassifier.train, kept for reference only.

    NOT functional: the '...' placeholders elide the real implementation, and
    NOTE(review): as written `estimator` would be handed a ConditionalFreqDist
    where a FreqDist of label counts is expected -- do not call this.
    """
    ...
    # Create the P(label) distribution
    label_freqdist = ConditionalFreqDist()
    label_probdist = estimator(label_freqdist)
    ...
    # Create the P(fval|label, fname) distribution
    feature_probdist = {}
    ...
    return NaiveBayesClassifier(label_probdist, feature_probdist)
#!print(label_probdist.prob('positive'))
#!print(feature_probdist)
# FIX: show_most_informative_features() prints its table itself and returns
# None, so wrapping it in print() also emitted a stray 'None' line.
classifier.show_most_informative_features(32)
#tweet = 'Meat Ban reminds me of TV Ban. If one of your siblings was taking the board exams, you cannot watch too'
#'And I support Cow,Buffalo #meatban if #India returns to #Swadeshi #agriculture Invest in Agriculture to #SaveIndia'
#print(classifier.classify(extract_features(tweet.split())))
In [13]:
import os
# NOTE(review): execution counts are non-sequential throughout this notebook
# (In [13] after In [71]); these cwd-juggling cells only work in the order they
# were last run, not under Restart & Run All.
#os.chdir("/home/archimedeas/wrkspc/anaconda/the-visual-verdict/visualizations/1_the_senate/datasets")
os.getcwd()
Out[13]:
In [17]:
# Inspect the current directory to locate the dataset folders.
os.listdir()
Out[17]:
In [15]:
# Move up one level...
os.chdir('..')
In [19]:
os.getcwd()
# ...then into the senate datasets folder (fails on re-run if the kernel
# is already inside it).
os.chdir('the_senate_datasets')
In [20]:
os.listdir()
Out[20]:
In [21]:
import pandas as pd
# Senate age distribution in 5-year spans; leaving the frame as the last
# expression displays it inline.
df_men = pd.read_csv("1_age_group_5yr_span.csv")
df_men
Out[21]:
In [28]:
# Render matplotlib figures inline and set a default style / figure size.
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10.0, 8.0)
In [29]:
# Column 0 -> labels, column 1 -> values; row 0 is deliberately skipped
# (same behaviour as the original index loop over iat).
ls_labels_men = [str(v) for v in df_men.iloc[1:, 0]]
ls_values_men = [float(v) for v in df_men.iloc[1:, 1]]
In [30]:
import numpy as np
import matplotlib.pyplot as plt

# Grouped-bar layout: one bar per age group.
n_groups = len(ls_labels_men)
bar_positions = np.arange(n_groups)  # the x locations for the groups
bar_width = 0.35                     # the width of the bars

fig, ax = plt.subplots()
men_bars = ax.bar(bar_positions, ls_values_men, bar_width, color='red', alpha=0.6)
#rects2 = ax.bar(ind + width, ls_values_men, width )

# Add text for labels, title and axes ticks.
ax.set_ylabel('Scores')
ax.set_title('Scores by group and gender')
ax.set_xticks(bar_positions + bar_width)
ax.set_xticklabels(ls_labels_men)
#ax.legend((rects1[0], rects2[0]), ('Men', 'Women'))
plt.show()
In [31]:
import pandas as pd
# NOTE(review): this overwrites df_men from the age-group cells above with the
# educational-background table -- the earlier displayed output is now stale.
df_men = pd.read_csv("4_educational_background.csv")
In [32]:
df_men
Out[32]:
In [33]:
# Education categories (col 0) and their values (col 1), skipping row 0
# exactly as the original iat loop did.
ls_labels_men = [str(entry) for entry in df_men.iloc[1:, 0]]
ls_values_men = [float(entry) for entry in df_men.iloc[1:, 1]]
In [34]:
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams['figure.figsize'] = (10.0, 10.0)
# The slices will be ordered and plotted counter-clockwise.
labels = ls_labels_men
sizes = ls_values_men
# matplotlib cycles this list if there are more slices than colours.
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'red', 'lightgreen']
# FIX: explode was a hard-coded 6-tuple and raised whenever the CSV did not
# have exactly 6 categories; derive it from the number of slices instead.
explode = tuple(0.1 for _ in sizes)
p, text = plt.pie(sizes, colors = colors, explode = explode, shadow=True, startangle=90 )
# Set aspect ratio to be equal so that pie is drawn as a circle.
plt.axis('equal')
#plt.title("Educational Background", fontsize = 50, loc = 'right')
plt.legend(p, labels, loc= 'lower right')
plt.show()
In [47]:
os.getcwd()
Out[47]:
In [50]:
# Back out of the senate datasets folder...
os.chdir('..')
In [45]:
os.getcwd()
Out[45]:
In [52]:
# ...and into the processed census data.
os.chdir('processed_census_datasets')
In [53]:
import pandas as pd
# Census population/education table (C-10 SC sheet exported to CSV).
df = pd.read_csv("processed-population-and-education_DDW-0000C-10.xlsx - C-10 SC.csv")
df
Out[53]:
In [54]:
# Sanity check on row/column counts.
df.shape
Out[54]:
In [55]:
os.listdir()
Out[55]:
In [30]:
# NOTE(review): bokeh._legacy_charts is a private module that was removed from
# bokeh long ago -- this cell only runs against the old bokeh version it was
# written for.
from bokeh._legacy_charts import Bar
from bokeh.io import output_notebook, show
# get the countries and we group the data by medal type
states = ['delhi', 'assam']
# Per-state response splits; only the first row of each ([0]) is plotted below.
delhi = [ [56,46],
          [23,77],
          [45,55],
          [60,40],
          [35,15,25,25]
        ]
assam = [ [30,10],
          [33,67],
          [75,25],
          [50,50],
          [75,5,10,10]
        ]
output_notebook()
# bar2 is built but never shown (only `bar` is displayed).
bar = Bar([delhi[0],assam[0]], states, title="Stacked bars", stacked=True)
bar2 = Bar([delhi[0],assam[0]], states, title="Stacked bars")
show(bar)
In [58]:
# Burtin-style annular chart (adapted from bokeh's classic "burtin" example):
# one large background wedge per attitude row, with three nested wedges for the
# Positive / Negative / Dont_care values drawn on a log-scaled radius.
from collections import OrderedDict
from math import log, sqrt
import numpy as np
import pandas as pd
from six.moves import cStringIO as StringIO
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# Inline CSV: per-region attitude percentages plus a 'gram' grouping column
# (naming reused from the antibiotics example; it colours the big wedges).
antibiotics = """
attitude, Positive, Negative, Dont_care, gram
ASSAM, 80, 5, 5, negative
DELHI, 10, 0.8, 0.09, negative
INDIA, 3, 0.1, 0.1, positive"""

# Colour per measure (small wedges)...
drug_color = OrderedDict([
    ("Dont_care", "#0d3362"),
    ("Positive", "#c64737"),
    ("Negative", "black" ),
])
# ...and per 'gram' group (big background wedges).
gram_color = {
    "positive" : "#aeaeb8",
    "negative" : "#e69584",
}

df = pd.read_csv(StringIO(antibiotics),
                 skiprows=1,
                 skipinitialspace=True,
                 engine='python')

width = 800
height = 800
inner_radius = 90
outer_radius = 300 - 10

# Radial mapping r = a*sqrt(log(value*1e4)) + b, taking the value range
# [0.001, 1000] onto [outer_radius, inner_radius].
minr = sqrt(log(.001 * 1E4))
maxr = sqrt(log(1000 * 1E4))
a = (outer_radius - inner_radius) / (minr - maxr)
b = inner_radius - a * maxr

def rad(mic):
    # Radius for a measured value on the log scale defined above.
    return a * np.sqrt(np.log(mic * 1E4)) + b

big_angle = 2.0 * np.pi / (len(df) + 1)  # one big wedge per row, plus one gap
small_angle = big_angle / 7              # seven slots inside each big wedge

# All wedges are centred on the origin.
x = np.zeros(len(df))
y = np.zeros(len(df))

#output_file("burtin.html", title="burtin.py example")
output_notebook()

# NOTE(review): background_fill/border_fill are the old (pre-0.12) bokeh
# property names -- confirm the installed bokeh version before re-running.
p = figure(plot_width=width, plot_height=height, title="",
           x_axis_type=None, y_axis_type=None,
           x_range=[-420, 420], y_range=[-420, 420],
           min_border=0, outline_line_color="black",
           background_fill="#f0e1d2", border_fill="#f0e1d2")

p.line(x+1, y+1, alpha=0)

# annular wedges (big background wedges, one per row)
angles = np.pi/2 - big_angle/2 - df.index.to_series()*big_angle
colors = [gram_color[gram] for gram in df.gram]
p.annular_wedge(
    x, y, inner_radius, outer_radius, -big_angle+angles, angles, color=colors,
)

# small wedges (the three measures, each in its own angular slot)
p.annular_wedge(x, y, inner_radius, rad(df.Dont_care),
                -big_angle+angles+5*small_angle, -big_angle+angles+6*small_angle,
                color=drug_color['Dont_care'])
p.annular_wedge(x, y, inner_radius, rad(df.Positive),
                -big_angle+angles+3*small_angle, -big_angle+angles+4*small_angle,
                color=drug_color['Positive'])
p.annular_wedge(x, y, inner_radius, rad(df.Negative),
                -big_angle+angles+1*small_angle, -big_angle+angles+2*small_angle,
                color=drug_color['Negative'])

# circular axes and labels (decade rings for 10^-3 .. 10^3)
labels = np.power(10.0, np.arange(-3, 4))
radii = a * np.sqrt(np.log(labels * 1E4)) + b
p.circle(x, y, radius=radii, fill_color=None, line_color="white")
p.text(x[:-1], radii[:-1], [str(r) for r in labels[:-1]],
       text_font_size="8pt", text_align="center", text_baseline="middle")

# radial axes (spokes between the big wedges)
p.annular_wedge(x, y, inner_radius-10, outer_radius+10,
                -big_angle+angles, -big_angle+angles, color="black")

# attitude labels, placed on the outermost ring
xr = radii[0]*np.cos(np.array(-big_angle/2 + angles))
yr = radii[0]*np.sin(np.array(-big_angle/2 + angles))
label_angle=np.array(-big_angle/2+angles)
label_angle[label_angle < -np.pi/2] += np.pi # easier to read labels on the left side
p.text(xr, yr, df.attitude, angle=label_angle,
       text_font_size="9pt", text_align="center", text_baseline="middle")

# OK, these hand drawn legends are pretty clunky, will be improved in future release
p.circle([-40, -40], [-370, -390], color=list(gram_color.values()), radius=5)
p.text([-30, -30], [-370, -390], text=[ "National", "States" ],
       text_font_size="7pt", text_align="left", text_baseline="middle")
p.rect([-40, -40, -40], [18, 0, -18], width=30, height=13,
       color=list(drug_color.values()))
p.text([-15, -15, -15], [18, 0, -18], text=list(drug_color.keys()),
       text_font_size="9pt", text_align="left", text_baseline="middle")

p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

show(p)
In [46]:
from bokeh.io import output_file, show
# Additionally save the last bokeh figure (p, from the cell above) to a
# standalone HTML file.
output_file('tale_of_cities.html')
show(p)
In [32]:
from IPython.display import Image
# Pre-load the screenshot images used in the write-up below.
image0 = Image('IndiaSelected.jpg')
image1 = Image('BanSectors.jpg')
image2 = Image('StatesSelected.jpg')
image3 = Image('BanTimeLine.jpg')
In [37]:
# Display the ban-timeline screenshot (last expression renders inline).
image3
Out[37]:
In [19]:
from IPython.display import Image, HTML, display
from glob import glob

# Render every .jpg in the current directory as a row of 180px thumbnails.
_thumb_template = "<img style='width: 180px; margin: 0px; float: left; border: 1px solid black;' src='%s' />"
imagesList = ''.join(_thumb_template % str(s) for s in glob('*.jpg'))
display(HTML(imagesList))
In [67]:
# NOTE(review): hardcoded absolute Windows path -- breaks on any other machine;
# prefer a configurable Path / DATA_DIR constant near the top of the notebook.
os.chdir('C:\\Users\\user\\Desktop\\7th Sem Project\\Jupyter')
In [68]:
os.listdir()
Out[68]:
In [70]:
from glob import glob
# List the exported figure PNGs in the project folder.
glob('*.png')
Out[70]:
In [ ]: