Fake Data Creator



In [46]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets



In [6]:

    
from faker import Factory,Faker



In [12]:

    
# Shared fake-data generator used throughout the notebook.
fake = Faker()
# Seed through the class method: current Faker releases raise TypeError when
# .seed() is called on an instance, and the class-level seed is the documented
# way to make generated values repeatable.
Faker.seed(101)



In [13]:

    
# Sample one fake company name as a quick check of the generator.
fake.company()









    Out[13]:





'Boone, Allen and Jones'



In [15]:

    
# Sample one fake postal address; the returned string contains an embedded newline.
fake.address()









    Out[15]:





'3167 Camacho Run Suite 466\nSouth Randy, SC 07381-5055'



In [41]:

    
# credit_card_full() returns one newline-separated string; break it into its
# individual lines before printing so each field is visible as a list item.
card_lines = fake.credit_card_full().split('\n')
print(card_lines)









    



['Voyager', 'Katelyn Middleton', '869983872352072 06/24', 'CVC: 347', '']



In [155]:

    
# Build a synthetic regression problem: 500 rows, 4 features (3 informative),
# plus a noisy target. random_state pins the draw so that Restart & Run All
# reproduces the same frame instead of a fresh random one each time.
X, y = datasets.make_regression(
    n_samples=500,
    n_features=4,
    n_informative=3,
    bias=1,
    noise=10,
    random_state=101,
)
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# Features and target side by side; the target becomes the last column.
df = pd.concat([X, y], axis=1)
df.columns = [
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]
df.head()









    Out[155]:






  
    
      
      Avg. Session Length
      Time on App
      Time on Website
      Length of Membership
      Yearly Amount Spent
    
  
  
    
      0
      1.497268
      0.655651
      2.577668
      0.482621
      87.951054
    
    
      1
      -1.073728
      -0.890539
      0.268959
      -0.935966
      -107.795067
    
    
      2
      0.000915
      -0.669722
      0.110597
      0.504543
      -12.452495
    
    
      3
      1.305557
      1.717514
      -0.278717
      -0.479821
      81.852344
    
    
      4
      0.330673
      0.795189
      0.536653
      0.846308
      99.406092



In [156]:

    
# Summary statistics of the synthetic frame as generated, before the per-column offsets are added.
df.describe()









    Out[156]:






  
    
      
      Avg. Session Length
      Time on App
      Time on Website
      Length of Membership
      Yearly Amount Spent
    
  
  
    
      count
      500.000000
      500.000000
      500.000000
      500.000000
      500.000000
    
    
      mean
      0.053194
      0.052488
      0.060445
      -0.066538
      -0.685962
    
    
      std
      0.992563
      0.994216
      1.010489
      0.999278
      79.314782
    
    
      min
      -3.467571
      -3.491848
      -3.086153
      -3.330099
      -243.329418
    
    
      25%
      -0.658178
      -0.611847
      -0.650743
      -0.669550
      -54.961723
    
    
      50%
      0.082008
      -0.016769
      0.069367
      -0.066025
      -1.112125
    
    
      75%
      0.711985
      0.753850
      0.716432
      0.526502
      49.313828
    
    
      max
      3.139662
      3.126994
      3.005182
      3.322689
      265.518462



In [157]:

    
# Per-column constants, matched to df.columns by position.
plus = [33, 12, 37, 3.6, 500]
# Shift every column by its constant using a whole-column add.
# Note: re-running this cell shifts the columns again.
for i, col in enumerate(df.columns):
    df[col] = df[col] + plus[i]



In [158]:

    
# Summary statistics again, now that each column has been shifted by its offset.
df.describe()









    Out[158]:






  
    
      
      Avg. Session Length
      Time on App
      Time on Website
      Length of Membership
      Yearly Amount Spent
    
  
  
    
      count
      500.000000
      500.000000
      500.000000
      500.000000
      500.000000
    
    
      mean
      33.053194
      12.052488
      37.060445
      3.533462
      499.314038
    
    
      std
      0.992563
      0.994216
      1.010489
      0.999278
      79.314782
    
    
      min
      29.532429
      8.508152
      33.913847
      0.269901
      256.670582
    
    
      25%
      32.341822
      11.388153
      36.349257
      2.930450
      445.038277
    
    
      50%
      33.082008
      11.983231
      37.069367
      3.533975
      498.887875
    
    
      75%
      33.711985
      12.753850
      37.716432
      4.126502
      549.313828
    
    
      max
      36.139662
      15.126994
      40.005182
      6.922689
      765.518462



In [ ]:



In [ ]:



In [ ]:



In [159]:

    
# Repeat of the previous describe() cell; the cells in between are empty, so df is unchanged.
df.describe()









    Out[159]:






  
    
      
      Avg. Session Length
      Time on App
      Time on Website
      Length of Membership
      Yearly Amount Spent
    
  
  
    
      count
      500.000000
      500.000000
      500.000000
      500.000000
      500.000000
    
    
      mean
      33.053194
      12.052488
      37.060445
      3.533462
      499.314038
    
    
      std
      0.992563
      0.994216
      1.010489
      0.999278
      79.314782
    
    
      min
      29.532429
      8.508152
      33.913847
      0.269901
      256.670582
    
    
      25%
      32.341822
      11.388153
      36.349257
      2.930450
      445.038277
    
    
      50%
      33.082008
      11.983231
      37.069367
      3.533975
      498.887875
    
    
      75%
      33.711985
      12.753850
      37.716432
      4.126502
      549.313828
    
    
      max
      36.139662
      15.126994
      40.005182
      6.922689
      765.518462



In [160]:

    
# Generate one fake value per row. All emails are drawn first, then all
# addresses, then all colour names, so the generator is consumed in that order.
df['Email'] = [fake.email() for _ in df.index]
df['Address'] = [fake.address() for _ in df.index]
df['Avatar'] = [fake.color_name() for _ in df.index]



In [161]:

    
# Reorder the columns for export: identity fields first, then the numeric
# features, with the target ('Yearly Amount Spent') last.
customers = df.loc[:, [
    'Email',
    'Address',
    'Avatar',
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]]



In [162]:

    
# Write the customer table as CSV without the index column.
# The output file name has no ".csv" extension.
customers.to_csv('Ecommerce Customers',index=False)



In [114]:



In [133]:



In [132]:

    
# Imported in this cell because the notebook's `from random import randint`
# cell sits below this one; a top-to-bottom run would otherwise raise NameError.
from random import randint

# Add a random amount between 0.00 and 1.00 (in steps of 0.01) to each row's
# session length. randint(0, 100) includes both endpoints.
df['Avg. Session Length'] = df['Avg. Session Length'].apply(lambda x : x+randint(0,100)*0.01)



In [121]:

    
from random import randint



In [131]:

    
# Scratch cell: one randint(0, 100) draw scaled by 0.1.
randint(0,100)*0.1









    Out[131]:





10.0



In [ ]:



In [ ]:



In [ ]:



In [163]:

    
# FAKE HOUSING DATA
# 5000 rows, 5 features (all informative) plus a noisy target. random_state
# pins the draw so that Restart & Run All reproduces the same frame.
X, y = datasets.make_regression(
    n_samples=5000,
    n_features=5,
    n_informative=5,
    bias=3,
    noise=15,
    random_state=101,
)
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# Features and target side by side; the target becomes the last column.
df = pd.concat([X, y], axis=1)



In [169]:

    
# Name the five feature columns and the target column ('Price') of the housing frame.
df.columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
    'Price',
]



In [254]:

    
# Attach one fake address per row. This uses the `fake` generator created near
# the top of the notebook: the name `f` that this cell used before is only
# defined in a later cell, so a top-to-bottom run raised NameError.
# 'Area Population' only supplies the row count; its values are ignored.
df['Address'] = df['Area Population'].apply(lambda _: fake.address())



In [255]:

    
# Preview the first rows of the housing frame, including the Address column.
df.head()









    Out[255]:






  
    
      
      Avg. Area Income
      Avg. Area House Age
      Avg. Area Number of Rooms
      Avg. Area Number of Bedrooms
      Area Population
      Price
      Address
    
  
  
    
      0
      79545.458574
      5.682861
      7.009188
      4.09
      23086.800503
      1.059034e+06
      208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
    
    
      1
      79248.642455
      6.002900
      6.730821
      3.09
      40173.072174
      1.505891e+06
      188 Johnson Views Suite 079\nLake Kathleen, CA...
    
    
      2
      61287.067179
      5.865890
      8.512727
      5.13
      36882.159400
      1.058988e+06
      9127 Elizabeth Stravenue\nDanieltown, WI 06482...
    
    
      3
      63345.240046
      7.188236
      5.586729
      3.26
      34310.242831
      1.260617e+06
      USS Barnett\nFPO AP 44820
    
    
      4
      59982.197226
      5.040555
      7.839388
      4.23
      26354.109472
      6.309435e+05
      USNS Raymond\nFPO AE 09386



In [257]:

    
# Write the housing frame to CSV without the index column.
# NOTE(review): the cell that rescales 'Price' appears below this one, so a
# top-to-bottom run saves Price before it is scaled — confirm the intended order.
df.to_csv('CA_Housing.csv',index=False)



In [244]:

    
# Multiply every price by a constant factor of 2500 (whole-column arithmetic).
# Note: re-running this cell scales the column again.
df['Price'] = df['Price'] * 2500



In [233]:

    
def fix(x):
    """Map a numeric value to a small integer bucket.

    Returns 1 when ``x <= 3``; a random integer from 2 to 4 (inclusive) when
    ``3 < x < 7``; otherwise a random integer from 3 to 6 (inclusive).
    Values that compare False to everything (e.g. NaN) fall into the last case.
    """
    if x <= 3:
        return 1
    # Reaching here already means x is not <= 3, so only the upper bound matters.
    if x < 7:
        return randint(2, 4)
    return randint(3, 6)



In [ ]:

    
# Draft per-column offsets. The original list had an empty second slot
# (`[5,,37,3.6,500]`), which is a SyntaxError; 0 is used there as a no-op
# placeholder so the cell at least parses.
# TODO(review): supply the intended second offset.
plus = [5, 0, 37, 3.6, 500]
# zip stops at the shorter sequence, so columns that have no offset are left
# unchanged instead of raising IndexError on plus[i].
for col, offset in zip(df.columns, plus):
    df[col] = df[col] + offset



In [231]:

    
# Scratch cell: one draw from randint(2, 4), the range fix() uses when 3 < x < 7.
randint(2,4)









    Out[231]:





4



In [249]:

    
# Scratch arithmetic check.
50*.01









    Out[249]:





0.5



In [253]:

    
# Second Faker generator, bound to `f`. It is created by a separate Faker() call
# from the `fake` instance made earlier, and is not explicitly seeded in this cell.
from faker import Faker
f = Faker()



In [ ]:

    
# NOTE(review): there are no call parentheses here, so this evaluates to the
# `address` attribute itself rather than generating an address — confirm
# whether `f.address()` was intended.
f.address

	Avg. Session Length	Time on App	Time on Website	Length of Membership	Yearly Amount Spent
0	1.497268	0.655651	2.577668	0.482621	87.951054
1	-1.073728	-0.890539	0.268959	-0.935966	-107.795067
2	0.000915	-0.669722	0.110597	0.504543	-12.452495
3	1.305557	1.717514	-0.278717	-0.479821	81.852344
4	0.330673	0.795189	0.536653	0.846308	99.406092

	Avg. Session Length	Time on App	Time on Website	Length of Membership	Yearly Amount Spent
count	500.000000	500.000000	500.000000	500.000000	500.000000
mean	0.053194	0.052488	0.060445	-0.066538	-0.685962
std	0.992563	0.994216	1.010489	0.999278	79.314782
min	-3.467571	-3.491848	-3.086153	-3.330099	-243.329418
25%	-0.658178	-0.611847	-0.650743	-0.669550	-54.961723
50%	0.082008	-0.016769	0.069367	-0.066025	-1.112125
75%	0.711985	0.753850	0.716432	0.526502	49.313828
max	3.139662	3.126994	3.005182	3.322689	265.518462

	Avg. Area Income	Avg. Area House Age	Avg. Area Number of Rooms	Avg. Area Number of Bedrooms	Area Population	Price	Address
0	79545.458574	5.682861	7.009188	4.09	23086.800503	1.059034e+06	208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1	79248.642455	6.002900	6.730821	3.09	40173.072174	1.505891e+06	188 Johnson Views Suite 079\nLake Kathleen, CA...
2	61287.067179	5.865890	8.512727	5.13	36882.159400	1.058988e+06	9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3	63345.240046	7.188236	5.586729	3.26	34310.242831	1.260617e+06	USS Barnett\nFPO AP 44820
4	59982.197226	5.040555	7.839388	4.23	26354.109472	6.309435e+05	USNS Raymond\nFPO AE 09386