Fake Data Creator


In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets

In [6]:
from faker import Factory,Faker

In [12]:
# Single Faker generator used for the fake text fields (emails, addresses, ...).
fake = Faker()
# Seed through the class method: current Faker releases reject `.seed()` called
# on an instance, and a fixed seed keeps the generated values repeatable.
Faker.seed(101)

In [13]:
# Smoke test: display one generated company name.
fake.company()


Out[13]:
'Boone, Allen and Jones'

In [15]:
# Smoke test: display one generated address (the output shows it contains a newline).
fake.address()


Out[15]:
'3167 Camacho Run Suite 466\nSouth Randy, SC 07381-5055'

In [41]:
# The card details come back as one newline-separated string;
# break it into its lines and print them as a list.
card_lines = fake.credit_card_full().split('\n')
print(card_lines)


['Voyager', 'Katelyn Middleton', '869983872352072 06/24', 'CVC: 347', '']

In [155]:
# Synthetic customer data: 4 features (3 of them informative) and a noisy
# linear target. `random_state` pins the draw so a re-run of the notebook
# regenerates exactly the same frame.
X, y = datasets.make_regression(
    n_samples=500,
    n_features=4,
    n_informative=3,
    bias=1,
    noise=10,
    random_state=101,
)
X = pd.DataFrame(X)
y = pd.DataFrame(y)

# Features and target side by side, then human-readable column names
# (four features followed by the target).
df = pd.concat([X, y], axis=1)
df.columns = ['Avg. Session Length','Time on App','Time on Website','Length of Membership','Yearly Amount Spent']
df.head()


Out[155]:
Avg. Session Length Time on App Time on Website Length of Membership Yearly Amount Spent
0 1.497268 0.655651 2.577668 0.482621 87.951054
1 -1.073728 -0.890539 0.268959 -0.935966 -107.795067
2 0.000915 -0.669722 0.110597 0.504543 -12.452495
3 1.305557 1.717514 -0.278717 -0.479821 81.852344
4 0.330673 0.795189 0.536653 0.846308 99.406092

In [156]:
# Summary statistics of the generated frame before any offsets are applied.
df.describe()


Out[156]:
Avg. Session Length Time on App Time on Website Length of Membership Yearly Amount Spent
count 500.000000 500.000000 500.000000 500.000000 500.000000
mean 0.053194 0.052488 0.060445 -0.066538 -0.685962
std 0.992563 0.994216 1.010489 0.999278 79.314782
min -3.467571 -3.491848 -3.086153 -3.330099 -243.329418
25% -0.658178 -0.611847 -0.650743 -0.669550 -54.961723
50% 0.082008 -0.016769 0.069367 -0.066025 -1.112125
75% 0.711985 0.753850 0.716432 0.526502 49.313828
max 3.139662 3.126994 3.005182 3.322689 265.518462

In [157]:
# One additive offset per column, in df.columns order, to move the
# zero-centred generated values into a more realistic range.
# Note: re-running this cell shifts the columns again.
plus = [33, 12, 37, 3.6, 500]
for i, col in enumerate(df.columns):
    # Series + scalar adds the offset to every row of the column.
    df[col] = df[col] + plus[i]

In [158]:
# Summary statistics after the per-column offsets have been added.
df.describe()


Out[158]:
Avg. Session Length Time on App Time on Website Length of Membership Yearly Amount Spent
count 500.000000 500.000000 500.000000 500.000000 500.000000
mean 33.053194 12.052488 37.060445 3.533462 499.314038
std 0.992563 0.994216 1.010489 0.999278 79.314782
min 29.532429 8.508152 33.913847 0.269901 256.670582
25% 32.341822 11.388153 36.349257 2.930450 445.038277
50% 33.082008 11.983231 37.069367 3.533975 498.887875
75% 33.711985 12.753850 37.716432 4.126502 549.313828
max 36.139662 15.126994 40.005182 6.922689 765.518462

In [ ]:


In [ ]:


In [ ]:


In [159]:
# Same summary as the previous describe cell; only empty cells sit between the two.
df.describe()


Out[159]:
Avg. Session Length Time on App Time on Website Length of Membership Yearly Amount Spent
count 500.000000 500.000000 500.000000 500.000000 500.000000
mean 33.053194 12.052488 37.060445 3.533462 499.314038
std 0.992563 0.994216 1.010489 0.999278 79.314782
min 29.532429 8.508152 33.913847 0.269901 256.670582
25% 32.341822 11.388153 36.349257 2.930450 445.038277
50% 33.082008 11.983231 37.069367 3.533975 498.887875
75% 33.711985 12.753850 37.716432 4.126502 549.313828
max 36.139662 15.126994 40.005182 6.922689 765.518462

In [160]:
# Attach one fake value per row. Columns are filled one after another
# (all emails, then all addresses, then all colour names) so the generator
# is consumed in the same order as before.
n_rows = len(df)
df['Email'] = [fake.email() for _ in range(n_rows)]
df['Address'] = [fake.address() for _ in range(n_rows)]
# 'Avatar' holds a colour name.
df['Avatar'] = [fake.color_name() for _ in range(n_rows)]

In [161]:
# Final column order for the export: text fields first, then the numeric
# features, with the target last.
customer_columns = [
    'Email',
    'Address',
    'Avatar',
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
    'Yearly Amount Spent',
]
customers = df[customer_columns]

In [162]:
# Write the customer table to disk without the index column.
# NOTE(review): the file name has no ".csv" extension — confirm that is intended.
customers.to_csv('Ecommerce Customers',index=False)

In [114]:


In [133]:


In [132]:
# Imported in this cell so it runs on a fresh kernel: the notebook's other
# `randint` import only appears in a later cell.
from random import randint

# Add an independent random bump of 0.00-1.00 (in 0.01 steps) to every row.
# Note: this cell sits after the CSV export above, so in top-to-bottom order
# the bump is not part of the saved file.
df['Avg. Session Length'] = df['Avg. Session Length'].apply(lambda x : x+randint(0,100)*0.01)

In [121]:
from random import randint

In [131]:
# Scratch check of a scaled random integer.
# NOTE(review): this uses a factor of 0.1 while the session-length cell uses 0.01 — confirm which was meant.
randint(0,100)*0.1


Out[131]:
10.0

In [ ]:


In [ ]:


In [ ]:


In [163]:
# FAKE HOUSING DATA
# 5 informative features and a noisy linear target. `random_state` pins the
# draw so the housing frame is identical on every run.
X, y = datasets.make_regression(
    n_samples=5000,
    n_features=5,
    n_informative=5,
    bias=3,
    noise=15,
    random_state=101,
)
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# Features followed by the target as the last column.
df = pd.concat([X, y], axis=1)

In [169]:
# Readable names for the five feature columns, then the target column.
housing_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
    'Price',
]
df.columns = housing_columns

In [254]:
# One fake address per row. Uses the `fake` generator created near the top of
# the notebook: `f` is only created in a later cell, so referencing it here
# fails when the notebook is run top to bottom.
df['Address'] = df['Area Population'].apply(lambda x: fake.address())

In [255]:
# Preview the first rows of the housing frame, including the new Address column.
df.head()


Out[255]:
Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms Avg. Area Number of Bedrooms Area Population Price Address
0 79545.458574 5.682861 7.009188 4.09 23086.800503 1.059034e+06 208 Michael Ferry Apt. 674\nLaurabury, NE 3701...
1 79248.642455 6.002900 6.730821 3.09 40173.072174 1.505891e+06 188 Johnson Views Suite 079\nLake Kathleen, CA...
2 61287.067179 5.865890 8.512727 5.13 36882.159400 1.058988e+06 9127 Elizabeth Stravenue\nDanieltown, WI 06482...
3 63345.240046 7.188236 5.586729 3.26 34310.242831 1.260617e+06 USS Barnett\nFPO AP 44820
4 59982.197226 5.040555 7.839388 4.23 26354.109472 6.309435e+05 USNS Raymond\nFPO AE 09386

In [257]:
# Write the housing frame to CA_Housing.csv without the index column.
df.to_csv('CA_Housing.csv',index=False)

In [244]:
# Scale every price by 2500 with a single vectorised multiply.
# Note: this cell sits after the CSV export above, so in top-to-bottom order
# the scaling is not part of the saved file; re-running it scales again.
df['Price'] = df['Price'] * 2500

In [233]:
def fix(x):
    """Map a numeric value onto a small integer.

    Returns 1 when ``x`` is at most 3, a random integer from 2 to 4
    (inclusive) when ``x`` lies strictly between 3 and 7, and a random
    integer from 3 to 6 (inclusive) for anything else.
    """
    # Guard clause: small values always map to 1 and draw no random number.
    if x <= 3:
        return 1
    # Pick the inclusive range to draw from, then draw exactly once.
    low, high = (2, 4) if 3 < x < 7 else (3, 6)
    return randint(low, high)

In [ ]:
# FIXME: this cell does not parse — the second entry of `plus` is missing (`5,,37`).
# NOTE(review): presumably per-column offsets like the earlier customer cell — confirm the intended values.
# The loop reads plus[i] for every column of df, so the list needs one entry per column.
plus = [5,,37,3.6,500]
for i,col in enumerate(df.columns):
    df[col] = df[col].apply(lambda x: x+plus[i])

In [231]:
# Scratch check: one random integer between 2 and 4 (the range used in `fix`).
randint(2,4)


Out[231]:
4

In [249]:
# Scratch arithmetic check.
50*.01


Out[249]:
0.5

In [253]:
# Second Faker generator, bound to `f`; `Faker` is already imported near the top of the notebook.
# NOTE(review): `f` is referenced by an earlier cell in file order (the housing Address cell) — confirm the intended cell order.
from faker import Faker
f = Faker()

In [ ]:
# Evaluates the method object itself without calling it.
# NOTE(review): presumably `f.address()` was intended — confirm.
f.address