In [3]:
import os
import glob

In [4]:
os.getcwd()


Out[4]:
'C:\\Users\\Dell'

In [5]:
# Resolve the Downloads folder from the current user's home directory instead
# of hard-coding one machine's absolute path; for the original user this
# evaluates to the same location.
path = os.path.join(os.path.expanduser('~'), 'Downloads')

In [6]:
# File extension (without the leading dot) of the files to search for.
extension = 'csv'
# Make `path` the process working directory; relative paths used after this
# point are resolved against it.
os.chdir(path)

In [7]:
# List every file in the current working directory whose name ends with the
# chosen extension. glob.glob() already returns a list, so no extra
# comprehension is needed to copy it.
result = glob.glob('*.{}'.format(extension))
print(result)


['AirPassengers.csv', 'BigDiamonds.csv', 'Boston (1).csv', 'Boston.csv', 'ccFraud.csv', 'class2.csv', 'data1.csv', 'datasets.csv', 'Diamond (1).csv', 'Diamond (2).csv', 'Diamond (3).csv', 'Diamond (4).csv', 'Diamond (5).csv', 'Diamond (6).csv', 'Diamond (7).csv', 'Diamond (8).csv', 'Diamond.csv', 'Hdma.csv', 'Hedonic.csv', 'pgd.csv', 'protein.csv', 'RidingMowers.csv', 'sales-of-shampoo-over-a-three-ye.csv', 'telecom.csv']

In [8]:
import pandas as pd

In [40]:
# Load the iris CSV from the Rdatasets mirror over HTTPS into a DataFrame
# (requires network access when the cell is run).
iris=pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv")

In [45]:
iris.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal.Length    150 non-null float64
Sepal.Width     150 non-null float64
Petal.Length    150 non-null float64
Petal.Width     150 non-null float64
Species         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB

In [104]:
import seaborn as sns
%matplotlib inline

In [46]:
import matplotlib.pyplot as plt 
%matplotlib inline

In [49]:
# Red bar chart: one bar per row, positioned at Sepal.Length with height Sepal.Width.
# NOTE(review): both axes are continuous measurements, so bars at equal
# Sepal.Length overlap — a scatter plot may be what was intended; confirm.
plt.bar(iris['Sepal.Length'],iris['Sepal.Width'],label="bar1",color='r')


Out[49]:
<Container object of 150 artists>

In [50]:
# Green bar chart: one bar per row, positioned at Petal.Length with height Petal.Width.
# NOTE(review): the label "bar1" is the same one used for the sepal chart — confirm it is intended.
plt.bar(iris['Petal.Length'],iris['Petal.Width'],label="bar1",color='g')


Out[50]:
<Container object of 150 artists>

In [52]:
fig=plt.figure()


<matplotlib.figure.Figure at 0xd110f60>

In [53]:
# Add two axes to `fig` on a 1-row x 2-column grid: ax1 in slot 1, ax2 in slot 2.
ax1=fig.add_subplot(1,2,1)
ax2=fig.add_subplot(1,2,2)

In [58]:
# Box plot of sepal length on the first subplot.
ax1.boxplot(iris['Sepal.Length'])
ax1.set_xlabel('Sepal.Length')
# `fig` was created in an earlier cell. With the inline backend a figure is
# rendered and closed when the cell that created it finishes, so plt.show()
# has nothing left to draw here. Ending the cell with the figure object
# displays it through its rich repr instead.
fig

In [59]:
# Box plot of petal length on the second subplot.
ax2.boxplot(iris['Petal.Length'])
ax2.set_xlabel('Petal.Length')
# `fig` was created in an earlier cell. With the inline backend a figure is
# rendered and closed when the cell that created it finishes, so plt.show()
# has nothing left to draw here. Ending the cell with the figure object
# displays it through its rich repr instead.
fig

In [60]:
plt.boxplot(iris['Petal.Length'])


Out[60]:
{'boxes': [<matplotlib.lines.Line2D at 0xd434320>],
 'caps': [<matplotlib.lines.Line2D at 0xd43add8>,
  <matplotlib.lines.Line2D at 0xd43af60>],
 'fliers': [<matplotlib.lines.Line2D at 0xd43ffd0>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0xd43f7f0>],
 'whiskers': [<matplotlib.lines.Line2D at 0xd434cf8>,
  <matplotlib.lines.Line2D at 0xd434eb8>]}

In [61]:
plt.hist(iris['Sepal.Length'])


Out[61]:
(array([  9.,  23.,  14.,  27.,  16.,  26.,  18.,   6.,   5.,   6.]),
 array([ 4.3 ,  4.66,  5.02,  5.38,  5.74,  6.1 ,  6.46,  6.82,  7.18,
         7.54,  7.9 ]),
 <a list of 10 Patch objects>)

In [64]:
# Scatter plot with Petal.Length on the x-axis and Sepal.Length on the y-axis.
plt.scatter(iris['Petal.Length'],iris['Sepal.Length'])


Out[64]:
<matplotlib.collections.PathCollection at 0x15efc668>

In [ ]:


In [67]:
# Number of rows per species. The top-level pd.value_counts() helper is
# deprecated in recent pandas releases; the Series method returns the same
# counts and works on old and new versions alike.
slices = iris['Species'].value_counts()
print(slices)


virginica     50
setosa        50
versicolor    50
Name: Species, dtype: int64

In [71]:
# Distinct species names, in order of first appearance, wrapped in a Series.
species_names = iris.Species.unique()
labels = pd.Series(species_names)
print(labels)


0        setosa
1    versicolor
2     virginica
dtype: object

In [ ]:
colors=['r','y','g']

In [83]:
# Pie chart of the share of rows per species.
# The wedge labels are taken from the counts' own index so that each label is
# guaranteed to match its wedge; a hand-written label list can silently fall
# out of step with the order value_counts() returns.
species_counts = iris['Species'].value_counts()
plt.pie(species_counts, labels=species_counts.index, colors=['r','y','g'], autopct='%1.1f%%')


Out[83]:
([<matplotlib.patches.Wedge at 0x19c3fb70>,
  <matplotlib.patches.Wedge at 0x19c3cda0>,
  <matplotlib.patches.Wedge at 0x19ca6f98>],
 [<matplotlib.text.Text at 0x19c3c3c8>,
  <matplotlib.text.Text at 0x19ca65c0>,
  <matplotlib.text.Text at 0x19cb37b8>],
 [<matplotlib.text.Text at 0x19c3c860>,
  <matplotlib.text.Text at 0x19ca6a58>,
  <matplotlib.text.Text at 0x19cb3c50>])

In [84]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
# Seed NumPy's global random state with a fixed integer (the sum of the
# character codes of "aesthetics") so random draws repeat across runs.
np.random.seed(sum(map(ord, "aesthetics")))

In [9]:
os.listdir()


Out[9]:
['140749_2017.pdf',
 '2011-F01-0700-Rev4-MDDS.XLSX',
 '20150817143155.pdf',
 '20160111060911.pdf',
 '20170214052225.pdf',
 '7z1604-x64.exe',
 '7z1604.exe',
 '861415_10151432783238421_2124270505_o (1).jpg',
 '861415_10151432783238421_2124270505_o.jpg',
 'AirPassengers.csv',
 'ajayo.jpg',
 'Alison Python  Invoice   - Sheet1.pdf',
 'Alison SAS  Invoice   - Sheet1.pdf',
 'All+CSV+Files+in+a+Folder.ipynb',
 'Allison Interview Jones Invoice   - Sheet1.pdf',
 'Anaconda3-4.2.0-Windows-x86_64.exe',
 'anscombe+dataset.ipynb',
 'apachehttpd.exe',
 'April invoice adaptive analytics   - Sheet1.pdf',
 'Assignment14_BusinessAnalytics (1).docx',
 'Assignment14_BusinessAnalytics.docx',
 'Assignment15_BusinessAnalytics.docx',
 'Assignment16_BusinessAnalytics (1).docx',
 'Assignment16_BusinessAnalytics (2).docx',
 'Assignment16_BusinessAnalytics.docx',
 'aug ust 2008.JPG',
 'avast_free_antivirus_setup_online.exe',
 'avinash_ltv.zip',
 'BigDiamonds.csv',
 'BigDiamonds.csv (1).zip',
 'BigDiamonds.csv (2)',
 'BigDiamonds.csv (2).zip',
 'BigDiamonds.csv (3).zip',
 'BigDiamonds.csv.zip',
 'Boston (1).csv',
 'Boston.csv',
 'CAM- Ajay Ohri (1).pdf',
 'CAM- Ajay Ohri.pdf',
 'camtasia.exe',
 'ccFraud.csv',
 'Certificate of Incorporation - U74999DL2015PTC282030 (26 June 2015).pdf',
 'CHAP1-6PythonforRUsersAnapproachforDataScience.docx',
 'chapter+3+_+spark.html',
 'chi+square+test.ipynb',
 'chromeinstall-8u111.exe',
 'Cisco_WebEx_Add-On.exe',
 'class2.csv',
 'Collabera Invoice (1).pdf',
 'Collabera Invoice.pdf',
 'Collectcent Invoice.pdf',
 'college degrees.pdf',
 'DAP 1.pdf',
 'DAP 1.pptx',
 'DAP 6 RDBMS and SQL.pdf',
 'DAP 6 RDBMS and SQL.pptx',
 'Data Analysis (1).7z',
 'Data Analysis (1).rar',
 'Data Analysis (2).rar',
 'Data Analysis (3).rar',
 'Data Analysis.rar',
 'Data Viz.pptx',
 'data+exploration.ipynb',
 'data+manipulation.ipynb',
 'data+munging+again.ipynb',
 'data+wrangling+titanic+dataset.ipynb',
 'data1.csv',
 'datasets.csv',
 'Decision Trees.pdf',
 'DecisionStatsOfferLetter.docx',
 'DecisionStatsRelievingLetter.docx',
 'descriptive+stats+in+Python.ipynb',
 'desktop.ini',
 'Diamond (1).csv',
 'Diamond (2).csv',
 'Diamond (3).csv',
 'Diamond (4).csv',
 'Diamond (5).csv',
 'Diamond (6).csv',
 'Diamond (7).csv',
 'Diamond (8).csv',
 'Diamond.csv',
 'DolbyVoiceClient.msi',
 'DropboxInstaller.exe',
 'edb_npgsql.exe',
 'edb_pgjdbc.exe',
 'edb_psqlodbc.exe',
 'edb_psqlodbc.exe-20170203172812',
 'edb_psqlodbc.exe-20170307203617',
 'final invoice edureka  - Sheet1.pdf',
 'FinalPythonforRUsersAnapproachforDataScience (1).docx',
 'FinalPythonforRUsersAnapproachforDataScience (2).docx',
 'FinalPythonforRUsersAnapproachforDataScience (3).docx',
 'FinalPythonforRUsersAnapproachforDataScience (4).docx',
 'FinalPythonforRUsersAnapproachforDataScience.docx',
 'final_webinar (1).pdf',
 'final_webinar.pdf',
 'Git-2.11.0-64-bit.exe',
 'Git-2.12.0-64-bit.exe',
 'GitHubSetup (1).exe',
 'GitHubSetup (2).exe',
 'GitHubSetup.exe',
 'GOMAUDIOGLOBALSETUP.EXE',
 'Hdma.csv',
 'Hedonic.csv',
 'HP Downloads',
 'HPSupportSolutionsFramework-12.5.32.203.exe',
 'image.png',
 'IMS PROSCHOOL Workshop.pptx.pdf',
 'IMS PROSCHOOL Workshop.pptx.pptx',
 'internship.docx',
 'Introduction to SAS (1).pdf',
 'Introduction to SAS Part 1 (1).pdf',
 'Introduction to SAS Part 1.pdf',
 'Introduction to SAS.pdf',
 'introductory+python.ipynb',
 'Invoice for Digital Vidya.pdf',
 'Invoice for Weekendr.pdf',
 'Invoice format - Ajay Ohri CONTATA (1).xls',
 'Invoice format - Ajay Ohri CONTATA.xls',
 'invoice rapid miner.pdf',
 'Invoice trafla format.docx',
 'iris2 (1).ipynb',
 'iris2 (2).ipynb',
 'iris2.ipynb',
 'January invoice Indicus  .pdf',
 'June AV   Invoice   - Sheet1.pdf',
 'Lecture 6 - KNN & Naive Bayes.ppt',
 'Local Disk (C) - Shortcut.lnk',
 'logistic regression - script for ppt.R',
 'logistic_regression_-_script_for_ppt.html',
 'lyncentry.exe',
 'March invoice Indicus   - Sheet1.pdf',
 'matplotlib+cars.ipynb',
 'matplotlib+line+graph.ipynb',
 'mongodb-win32-x86_64-2008plus-ssl-3.4.2-signed.msi',
 'mongodb-win32-x86_64-3.4.2-signed.msi',
 'mortDefault',
 'mortDefault.zip',
 'mtcarslm.R',
 'multiple+file+concat+in+pandas (1).ipynb',
 'multiple+file+concat+in+pandas.ipynb',
 'my+first+class+in+python.ipynb',
 'nltk.ipynb',
 'notebook-Copy1.html',
 'Offer Letter - Ajay Ohri (1).pdf',
 'Offer Letter - Ajay Ohri.pdf',
 'Other Data Mining  Methods (1).pdf',
 'Other Data Mining  Methods.pdf',
 'output1 (1).xls',
 'output1 (2).xls',
 'output1.xls',
 'pandas+11.ipynb',
 'pandas+analysis+1.ipynb',
 'pandas+data+manipulation.ipynb',
 'passport image.pdf',
 'Pawconinvoice2016.pdf',
 'Pawconinvoice2017 (1).pdf',
 'Pawconinvoice2017 (2).pdf',
 'Pawconinvoice2017 (3).pdf',
 'Pawconinvoice2017.pdf',
 'Payslip Feb 2016 - Sheet1.pdf',
 'Payslip Feb 2016.pdf',
 'Payslip Format Decisionstats - Sheet1.pdf',
 'Payslip Jan 2016 - Sheet1.pdf',
 'Payslip Jan 2016.pdf',
 'Payslip March 2016 - Sheet1.pdf',
 'Payslip March 2016.pdf',
 'pgd.csv',
 'postgresql-9.6.1-1-windows-x64.exe',
 'Program 1-results.rtf',
 'protein.csv',
 'python+with+postgres (1).ipynb',
 'python+with+postgres.ipynb',
 'Python.docx',
 'R-3.3.2-win.exe',
 'R-3.3.3-win.exe',
 'RCertificationExam.pdf',
 'reg+model.ipynb',
 'Revision -  Business Analytics (1).pdf',
 'Revision -  Business Analytics.pdf',
 'RidingMowers.csv',
 'rsconnect',
 'RStudio-1.0.136.exe',
 'Salary Slip, Feb 2016.pdf',
 'Salary Slip, Jan 2016.pdf',
 'Salary Slip, March 2016 (1).pdf',
 'Salary Slip, March 2016 (2).pdf',
 'Salary Slip, March 2016.pdf',
 'sales-of-shampoo-over-a-three-ye.csv',
 'SAS part 2.pdf',
 'SAS Part 3.pdf',
 'sas-university-edition-107140.pdf',
 'Scan0095.pdf',
 'Scanned Invoice for Collabera.pdf',
 'Screenshot 2017-01-23 12.36.55.png',
 'September invoice adaptive analytics   - Sheet1.pdf',
 'simple+matplot+graph.ipynb',
 'Sollers January.pdf',
 'sqlalchemy.ipynb',
 'stackoverflow-dump-analysis.html',
 'Sunstone.pdf',
 'Tableau.pdf',
 'TableauPublicDesktop-64bit-10-1-3.exe',
 'TableauPublicDesktop-64bit-10-1-4.exe',
 'telecom.csv',
 'TelecomServiceProviderCaseStudy.pdf',
 'test+web+scraping.ipynb',
 'Text Mining (1).pdf',
 'Text Mining.pdf',
 'third.sas7bdat',
 'Time Series  Forecasting (1).pdf',
 'Time Series  Forecasting.pdf',
 'ts.html',
 'ts.R',
 'Unconfirmed 373974.crdownload',
 'Unconfirmed 376991.crdownload',
 'Unconfirmed 950045.crdownload',
 'uTorrent.exe',
 'VirtualBox-5.1.8-111374-Win (1).exe',
 'VirtualBox-5.1.8-111374-Win.exe',
 'visualcppbuildtools_full.exe',
 'Web+Scraping+Yelp+with+Beautiful+Soup.ipynb',
 'Webinar for Business Analytics.pdf',
 'WhatsApp Image 2017-02-18 at 08.42.55 (1).jpeg',
 'WhatsApp Image 2017-02-18 at 08.42.55.jpeg']

In [85]:
def sinplot(flip=1, ax=None):
    """Draw six phase-shifted sine curves of decreasing amplitude.

    Curve ``i`` (for ``i`` in 1..6) is ``sin(x + 0.5 * i) * (7 - i) * flip``
    evaluated at 100 evenly spaced points on ``[0, 14]``.

    Parameters
    ----------
    flip : number, default 1
        Multiplier applied to every curve; pass ``-1`` to mirror the curves
        about the x-axis.
    ax : object with a ``plot(x, y)`` method, optional
        Axes to draw on. When omitted, the current pyplot axes are used,
        which matches calling ``plt.plot`` directly.
    """
    if ax is None:
        # Same target plt.plot() would pick: the currently active axes.
        ax = plt.gca()
    x = np.linspace(0, 14, 100)
    for i in range(1, 7):
        # Phase grows by 0.5 per curve while the amplitude shrinks from 6 to 1.
        ax.plot(x, np.sin(x + i * .5) * (7 - i) * flip)

In [86]:
sinplot()



In [87]:
sns.set_style("white")

In [88]:
sinplot()



In [89]:
sns.set_style("ticks")
sinplot()



In [91]:
sns.palplot(sns.color_palette())



In [92]:
sns.palplot(sns.color_palette("hls",8))



In [93]:
sns.palplot(sns.color_palette("BuGn", 10))



In [94]:
sinplot()



In [95]:
# Load the BigDiamonds CSV from a hard-coded absolute Windows path.
# NOTE(review): this ties the notebook to one machine's directory layout —
# consider building the path from a configurable data directory.
diamonds=pd.read_csv("C:\\Users\\Dell\\Downloads\\BigDiamonds.csv\\BigDiamonds.csv")

In [96]:
type(diamonds)


Out[96]:
pandas.core.frame.DataFrame

In [97]:
len(diamonds)


Out[97]:
598024

In [98]:
diamonds.columns


Out[98]:
Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'table', 'depth',
       'cert', 'measurements', 'price', 'x', 'y', 'z'],
      dtype='object')

In [99]:
diamonds.shape


Out[99]:
(598024, 13)

In [15]:
diamonds.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598024 entries, 0 to 598023
Data columns (total 13 columns):
Unnamed: 0      598024 non-null int64
carat           598024 non-null float64
cut             598024 non-null object
color           598024 non-null object
clarity         598024 non-null object
table           598024 non-null float64
depth           598024 non-null float64
cert            598024 non-null object
measurements    597978 non-null object
price           597311 non-null float64
x               596209 non-null float64
y               596172 non-null float64
z               595480 non-null float64
dtypes: float64(7), int64(1), object(5)
memory usage: 59.3+ MB

In [100]:
diamonds.head()


Out[100]:
Unnamed: 0 carat cut color clarity table depth cert measurements price x y z
0 1 0.25 V.Good K I1 59.0 63.7 GIA 3.96 x 3.95 x 2.52 NaN 3.96 3.95 2.52
1 2 0.23 Good G I1 61.0 58.1 GIA 4.00 x 4.05 x 2.30 NaN 4.00 4.05 2.30
2 3 0.34 Good J I2 58.0 58.7 GIA 4.56 x 4.53 x 2.67 NaN 4.56 4.53 2.67
3 4 0.21 V.Good D I1 60.0 60.6 GIA 3.80 x 3.82 x 2.31 NaN 3.80 3.82 2.31
4 5 0.31 V.Good K I1 59.0 62.2 EGL 4.35 x 4.26 x 2.68 NaN 4.35 4.26 2.68

In [101]:
# Keep an independent copy of the frame as loaded; `diamonds` itself is
# reassigned by later cleaning cells.
diamonds2=diamonds.copy()

In [102]:
# Number of diamonds per cut grade. Uses the Series method because the
# top-level pd.value_counts() helper is deprecated in recent pandas releases.
diamonds2['cut'].value_counts()


Out[102]:
Ideal     369448
V.Good    168896
Good       59680
Name: cut, dtype: int64

In [103]:
diamonds.describe()


C:\Users\Dell\Anaconda3\lib\site-packages\numpy\lib\function_base.py:3834: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[103]:
Unnamed: 0 carat table depth price x y z
count 598024.000000 598024.000000 598024.000000 598024.000000 597311.000000 596209.000000 596172.000000 595480.000000
mean 299012.500000 1.071297 57.631077 61.063683 8753.017974 5.990771 6.198671 4.033430
std 172634.803028 0.812696 4.996892 7.604342 13017.567760 1.530936 1.485891 1.240951
min 1.000000 0.200000 0.000000 0.000000 300.000000 0.150000 1.000000 0.040000
25% 149506.750000 0.500000 56.000000 61.000000 NaN NaN NaN NaN
50% 299012.500000 0.900000 58.000000 62.100000 NaN NaN NaN NaN
75% 448518.250000 1.500000 59.000000 62.700000 NaN NaN NaN NaN
max 598024.000000 9.250000 75.900000 81.300000 99990.000000 13.890000 13.890000 13.180000

In [107]:
# Remove the leftover row-number column written by the CSV export.
# `axis` is passed by keyword because current pandas no longer accepts it
# positionally, and errors="ignore" keeps the cell safe to re-run after the
# column has already been removed.
diamonds = diamonds.drop("Unnamed: 0", axis=1, errors="ignore")

In [105]:
# Discard every row that has a missing value in at least one column.
diamonds=diamonds.dropna(how="any")

In [108]:
# 20-bin histogram of price with a kernel density estimate overlaid and no rug marks.
# NOTE(review): distplot is deprecated in newer seaborn releases (histplot /
# displot replace it) — confirm the seaborn version this notebook targets.
sns.distplot(diamonds.price, bins=20, kde=True, rug=False)


C:\Users\Dell\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j
Out[108]:
<matplotlib.axes._subplots.AxesSubplot at 0x201290b8>

In [110]:
# Same 20-bin price histogram, this time without the kernel density estimate.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# seaborn version this notebook targets.
sns.distplot(diamonds.price, bins=20, kde=False, rug=False)


Out[110]:
<matplotlib.axes._subplots.AxesSubplot at 0xb4632b0>

In [111]:
sns.boxplot(x="color", y="price", data=diamonds)


Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0xb521a90>

In [112]:
# Joint distribution of price (x) and carat (y) from the unmodified copy of
# the data. The variables are passed by keyword: newer seaborn releases no
# longer accept them positionally, while keywords work on old and new
# versions alike.
sns.jointplot(x='price', y='carat', data=diamonds2)


Out[112]:
<seaborn.axisgrid.JointGrid at 0x19f420b8>

In [113]:
# Box plots of price by color, one panel (column) per cut grade; the trailing
# semicolon suppresses the returned object's repr in the cell output.
# NOTE(review): newer seaborn releases rename factorplot to catplot and the
# `size` argument to `height` — confirm the seaborn version this targets.
sns.factorplot(x="color", y="price",
col="cut", data=diamonds, kind="box", size=4, aspect=.5);



In [114]:
from ggplot import *

In [116]:
# `p` is otherwise only assigned in a later cell, so on a fresh top-to-bottom
# run this cell would raise NameError. Define the base plot here, using the
# same definition that appears in the notebook's last cell.
# NOTE(review): the definition `p` had when this cell was first executed is
# not visible in the notebook — confirm this mapping is the intended one.
p = ggplot(aes(x='price', y='carat', color="cut"), data=diamonds)
p + geom_point()


Out[116]:
<ggplot: (27091425)>

In [117]:
p + geom_point() +facet_grid('cut')


Out[117]:
<ggplot: (27091425)>

In [118]:
# Base ggplot object: price on x, carat on y, points colored by cut, drawn
# from the cleaned `diamonds` frame.
p = ggplot(aes(x='price', y='carat',color="cut"), data=diamonds)
# Render it as a scatter plot.
p + geom_point()