In [2]:
import numpy as np
import pandas as pd
这里构造一份测试数据:T1~T7 列代表各个时间点的产品销售量;product_type 列代表产品所属的类型(预设 4 个类型);product_name 列代表产品名。
In [4]:
# Fixed seed so that "Restart & Run All" regenerates exactly the same random
# test data (and therefore the same ranks) on every run.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

data_rang = 9                    # number of products, i.e. rows in the test data
pr_type = ['a', 'b', 'c', 'd']   # the four preset product types
# Draw one random product type per product.
p_type = [np.random.choice(pr_type) for i in range(data_rang)]

# Sales columns, one per time point.
time_columns = ['T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7']

# NOTE: this list must contain exactly `data_rang` names.
data = {'product_name': ['x0', 'x1', 'x3', 'x2', 'x4', 'x5', 'x6', 'x7', 'x8']}
# Random sales figures in [0, 100) for every product at every time point.
for time_column in time_columns:
    data[time_column] = np.random.randint(100, size=[data_rang])
data['product_type'] = p_type

# Passing `columns` fixes the column order: name, T1..T7, type.
test_data = pd.DataFrame(
    data,
    columns=['product_name'] + time_columns + ['product_type'],
)
In [5]:
# Show the generated test data (call form of print works on Python 2 and 3).
print(test_data)
dealing_data 先按产品类型对传入的数据分组,并截取指定时间段内各产品的销售数据;然后统计每个产品在该时间段内的销售总量,并在同类产品内按销售总量从高到低排序、给出排名。
In [6]:
def dealing_data(data, start_time, end_time):
    """Rank every product within its product type by total sales in a period.

    For each distinct value of ``data['product_type']`` the matching rows are
    passed to ``slicing_data`` to get the sales columns of the period. The
    values of each row are summed into a ``statistic`` column and the rows of
    that type are ranked from the highest total (rank 1) downwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``product_name`` and ``product_type`` plus
        the sales columns selected by ``slicing_data``.
    start_time, end_time
        Bounds of the period; forwarded unchanged to ``slicing_data``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_name``, ``product_type``, ``statistic`` and
        ``rank``. Rows keep their index labels from ``data`` and are grouped
        by product type in order of first appearance. If ``data`` has no
        rows, an empty frame with these four columns is returned.

    Notes
    -----
    The resulting table is also printed, as in the original notebook cell.
    """
    result_columns = ['product_name', 'product_type', 'statistic', 'rank']
    ranked_groups = []

    # unique() keeps first-appearance order, so the output row order is
    # deterministic (iterating a set is not).
    for item in data['product_type'].unique():
        group = data[data['product_type'] == item]
        slice_data = slicing_data(group, start_time, end_time)

        # Explicit copy: we add columns below and must not write into a view
        # of the caller's frame (avoids SettingWithCopyWarning).
        ranked = group.loc[:, ['product_name', 'product_type']].copy()
        ranked['statistic'] = slice_data.sum(axis=1)

        # Stable sort so that products with equal totals keep their original
        # relative order instead of an arbitrary one.
        ranked = ranked.sort_values('statistic', ascending=False, kind='mergesort')
        ranked['rank'] = np.arange(1, len(ranked) + 1)
        ranked_groups.append(ranked)

    # Concatenate once instead of appending inside the loop
    # (DataFrame.append is quadratic and no longer exists in pandas >= 2.0).
    if ranked_groups:
        result_df = pd.concat(ranked_groups)
    else:
        result_df = pd.DataFrame(columns=result_columns)

    print(result_df)
    return result_df
In [7]:
def slicing_data(data, start_time, end_time):
    """Return the columns of ``data`` from ``start_time`` through ``end_time``.

    The selection is a label-based ``.loc`` column slice, so both boundary
    columns are part of the result. All rows are kept.
    """
    column_span = slice(start_time, end_time)
    return data.loc[:, column_span]
In [8]:
def query_rank(data, query_product_name, start_time, end_time):
    """Return the sales rank of one product among products of its type.

    The ranking table is computed by ``dealing_data`` for the given period
    and the ``rank`` of the row whose ``product_name`` equals
    ``query_product_name`` is returned. If several rows carry that name, the
    first one in the ranking table is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Sales data, forwarded to ``dealing_data``.
    query_product_name
        Value to look up in the ``product_name`` column.
    start_time, end_time
        Bounds of the period, forwarded to ``dealing_data``.

    Raises
    ------
    IndexError
        If no row of the ranking table has the requested product name.
    """
    re_data = dealing_data(data, start_time, end_time)
    # Single .loc lookup instead of chained indexing.
    matches = re_data.loc[re_data['product_name'] == query_product_name, 'rank'].values
    if len(matches) == 0:
        # Same exception type as the bare matches[0] would raise, but with a
        # message that names the actual problem.
        raise IndexError(
            "product %r not found in the ranking table" % (query_product_name,)
        )
    return matches[0]
In [9]:
# Query the sales rank of product x6 among the products of the same type
# over the period T1 to T3.
result_rank = query_rank(test_data, 'x6', 'T1', 'T3')
print("query result , the rank is %d" % result_rank)
# Return the overall sales ranking table
In [10]:
def dealing_data_b(data, start_time, end_time):
    """Rank every product within its product type by total sales in a period.

    Same procedure as ``dealing_data``, except that the sales columns of the
    period are selected with ``slicing_data_b``. For each distinct value of
    ``data['product_type']`` the values of each row in the selected columns
    are summed into a ``statistic`` column and the rows of that type are
    ranked from the highest total (rank 1) downwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``product_name`` and ``product_type`` plus
        the sales columns selected by ``slicing_data_b``.
    start_time, end_time
        Bounds of the period; forwarded unchanged to ``slicing_data_b``.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_name``, ``product_type``, ``statistic`` and
        ``rank``. Rows keep their index labels from ``data`` and are grouped
        by product type in order of first appearance. If ``data`` has no
        rows, an empty frame with these four columns is returned.

    Notes
    -----
    The resulting table is also printed, as in the original notebook cell.
    """
    result_columns = ['product_name', 'product_type', 'statistic', 'rank']
    ranked_groups = []

    # unique() keeps first-appearance order, so the output row order is
    # deterministic (iterating a set is not).
    for item in data['product_type'].unique():
        group = data[data['product_type'] == item]
        slice_data = slicing_data_b(group, start_time, end_time)

        # Explicit copy: we add columns below and must not write into a view
        # of the caller's frame (avoids SettingWithCopyWarning).
        ranked = group.loc[:, ['product_name', 'product_type']].copy()
        ranked['statistic'] = slice_data.sum(axis=1)

        # Stable sort so that products with equal totals keep their original
        # relative order instead of an arbitrary one.
        ranked = ranked.sort_values('statistic', ascending=False, kind='mergesort')
        ranked['rank'] = np.arange(1, len(ranked) + 1)
        ranked_groups.append(ranked)

    # Concatenate once instead of appending inside the loop
    # (DataFrame.append is quadratic and no longer exists in pandas >= 2.0).
    if ranked_groups:
        result_df = pd.concat(ranked_groups)
    else:
        result_df = pd.DataFrame(columns=result_columns)

    print(result_df)
    return result_df
In [11]:
def slicing_data_b(data, start_time, end_time):
    """Return the columns of ``data`` labelled with the days of a date range.

    ``pd.date_range(start_time, end_time)`` is expanded into a list of
    timestamps and that list is used as the column selection, so the column
    labels of ``data`` are expected to be those timestamps.
    """
    wanted_days = list(pd.date_range(start_time, end_time))
    return data[wanted_days]
In [12]:
def query_rank_b(data, query_product_name, start_time, end_time):
    """Return the sales rank of one product among products of its type.

    The ranking table is computed by ``dealing_data_b`` for the given period
    and the ``rank`` of the row whose ``product_name`` equals
    ``query_product_name`` is returned. If several rows carry that name, the
    first one in the ranking table is used.

    Parameters
    ----------
    data : pandas.DataFrame
        Sales data, forwarded to ``dealing_data_b``.
    query_product_name
        Value to look up in the ``product_name`` column.
    start_time, end_time
        Bounds of the period, forwarded to ``dealing_data_b``.

    Raises
    ------
    IndexError
        If no row of the ranking table has the requested product name.
    """
    re_data = dealing_data_b(data, start_time, end_time)
    # Single .loc lookup instead of chained indexing.
    matches = re_data.loc[re_data['product_name'] == query_product_name, 'rank'].values
    if len(matches) == 0:
        # Same exception type as the bare matches[0] would raise, but with a
        # message that names the actual problem.
        raise IndexError(
            "product %r not found in the ranking table" % (query_product_name,)
        )
    return matches[0]
In [13]:
# Construct a DataFrame with one column per calendar day
# (2010-01-01 through 2010-01-09); each column holds `data_rang` random
# sales figures in [0, 100).
sale_value_date = {
    el: np.random.randint(100, size=[data_rang])
    for el in pd.date_range('20100101', '20100109')
}
sale_value_date_df = pd.DataFrame(sale_value_date)
print(sale_value_date_df)
In [14]:
# Product descriptions: names x0..x{data_rang - 1}, each with a randomly
# drawn product type.
pr_type = ['a', 'b', 'c', 'd']
p_name = ["x" + str(i) for i in range(data_rang)]
p_type = [np.random.choice(pr_type) for i in range(data_rang)]
product_data = {'product_name': p_name,
                'product_type': p_type}
# Passing `columns` fixes the column order.
product_data_df = pd.DataFrame(product_data, columns=['product_name', 'product_type'])
print(product_data_df)
In [15]:
# Put the per-day sales columns and the product description columns side by
# side (axis=1 aligns the two frames on their row index).
df = pd.concat([sale_value_date_df, product_data_df], axis=1)
print(df)
In [17]:
# Query the sales rank of product x4 among the products of the same type
# for 2010-01-01 through 2010-01-05.
result_rank_b = query_rank_b(df, 'x4', '2010-01-01', '2010-01-05')
print("query result , the rank is %d" % result_rank_b)
In [ ]: