In [2]:
import numpy as np
import pandas as pd

construct a test dataframe

这里构造一个测试数据, T1~T7的列代表某个时间下的产品销售量 product_type列代表产品所属的类型,预设4个类型; product_name代表产品名


In [4]:
data_rang = 9
pr_type = ['a', 'b', 'c', 'd']
p_type = [ np.random.choice(pr_type) for i in range(data_rang) ]
data = {'product_name' : ['x0', 'x1', 'x3', 'x2', 'x4', 'x5', 'x6', 'x7', 'x8'],
       'T1': np.random.randint(100, size = [data_rang]),
       'T2': np.random.randint(100, size = [data_rang]),
       'T3': np.random.randint(100, size = [data_rang]),
       'T4': np.random.randint(100, size = [data_rang]),
       'T5': np.random.randint(100, size = [data_rang]),
       'T6': np.random.randint(100, size = [data_rang]),
       'T7': np.random.randint(100, size = [data_rang]),
       'product_type': p_type}

test_data = pd.DataFrame(data, columns = ['product_name', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'product_type'])

In [5]:
print test_data


  product_name  T1  T2  T3  T4  T5  T6  T7 product_type
0           x0  40   1  64   3  64  85  28            b
1           x1  94  19  75  85  22   5  96            b
2           x3  65  83  10  32  72  73   2            b
3           x2  71  62  85  84  94  11   5            d
4           x4  45  20  85  88  38  25   1            a
5           x5  92  66   9   7  53  91   7            b
6           x6  15  43   2  59   7  52   7            a
7           x7  98  12  66  92   2  46  70            a
8           x8  57   7  75  42  92   6  86            a

main function

dealing_data 把传入的数据根据产品类型做数据截取,截取后的数据为在某个指定时间段内,每个产品类的所有产品在这个时间段内的销售数据 然后再统计该时间段内每个类产品的销售总量,并排序


In [6]:
def dealing_data(data, start_time, end_time):
    product_ty = set(data['product_type']) 
    result_df = pd.DataFrame()
    
    for item in product_ty:
        tmp_data = data[data['product_type'] == item]
        slice_data = slicing_data(tmp_data, start_time, end_time)
        columns_name = ['product_name', 'product_type']
        
        tmp_data = tmp_data.loc[:, columns_name]
        tmp_data['statistic'] = np.zeros(np.array(tmp_data).shape[0])
        tmp_data['statistic'] = np.sum(slice_data, axis = 1)
        
        tmp_data = tmp_data.sort_values('statistic', ascending = False)
        
        tmp_data['rank'] = range(len(tmp_data))
        tmp_data['rank'] += 1
        
        result_df = result_df.append(tmp_data)
    print result_df
    return result_df

In [7]:
def slicing_data(data, start_time, end_time):
    #select_column = [pd.to_datetime(start_time), pd.to_datetime(end_time)]
    #print data
    #print "***********",data[select_column]
    return data.loc[:, start_time : end_time]
    #return data[select_column]

查询函数, 通过输入指定产品名和起止时间参数, 返回该产品在该类中的销售排名


In [8]:
def query_rank(data, query_product_name, start_time, end_time):
    re_data = dealing_data(data, start_time, end_time)
    result = re_data[re_data['product_name'] == query_product_name]['rank'].values
    
    return result[0]

In [9]:
#查询 产品名为 x6 在 T1到T4这段时间内在同类产品中的销售排名
result_rank = query_rank(test_data, 'x6',  'T1', 'T3')

print "query result , the rank is %d"%result_rank

# 返回总的销售排名表


  product_name product_type  statistic  rank
7           x7            a        176     1
4           x4            a        150     2
8           x8            a        139     3
6           x6            a         60     4
1           x1            b        188     1
5           x5            b        167     2
2           x3            b        158     3
0           x0            b        105     4
3           x2            d        218     1
query result , the rank is 4

another form of datafram


In [10]:
def dealing_data_b(data, start_time, end_time):
    product_ty = set(data['product_type']) 
    result_df = pd.DataFrame()
    
    for item in product_ty:
        tmp_data = data[data['product_type'] == item]
        slice_data = slicing_data_b(tmp_data, start_time, end_time)
        columns_name = ['product_name', 'product_type']
        
        tmp_data = tmp_data.loc[:, columns_name]
        tmp_data['statistic'] = np.zeros(np.array(tmp_data).shape[0])
        tmp_data['statistic'] = np.sum(slice_data, axis = 1)
        
        tmp_data = tmp_data.sort_values('statistic', ascending = False)
        
        tmp_data['rank'] = range(len(tmp_data))
        tmp_data['rank'] += 1
        
        result_df = result_df.append(tmp_data)
    print result_df
    return result_df

In [11]:
def slicing_data_b(data, start_time, end_time):
    #select_column = [pd.to_datetime(start_time), pd.to_datetime(end_time)]
    select_columns = [ it for it in pd.date_range(start_time, end_time)]
    #print data
    #print "***********",data[select_column]
    #return data.loc[:, start_time : end_time]
    return data[select_columns]

In [12]:
def query_rank_b(data, query_product_name, start_time, end_time):
    re_data = dealing_data_b(data, start_time, end_time)
    result = re_data[re_data['product_name'] == query_product_name]['rank'].values
    
    return result[0]

In [13]:
# construct dataframe
sale_value_date = { el: np.random.randint(100, size = [data_rang]) for el in pd.date_range('20100101', '20100109')}
sale_value_date_df = pd.DataFrame(sale_value_date)

print sale_value_date_df


   2010-01-01  2010-01-02  2010-01-03  2010-01-04  2010-01-05  2010-01-06  \
0          72          62          27          47          97           8   
1          80          56          28          69          63          25   
2          12          70          73          31          48          19   
3          70          12          96          21          90           8   
4          84          85          62          38          32           6   
5          31          21          65          89          57          98   
6          35          68          94          98          63          12   
7          78          15           6           5          46          30   
8          32          66          11          96          12          87   

   2010-01-07  2010-01-08  2010-01-09  
0          53          18          27  
1          43          59          14  
2          53          49          38  
3           9          87          73  
4          90          49          43  
5           1          20          86  
6          77           2          65  
7          63           7          39  
8          30          24          96  

In [14]:
pr_type = ['a', 'b', 'c', 'd']
p_name = [ "x" + str(i) for i in range(data_rang) ]
p_type = [ np.random.choice(pr_type) for i in range(data_rang) ]

product_data = {'product_name': p_name,
               'product_type': p_type}

product_data_df = pd.DataFrame(product_data, columns = ['product_name', 'product_type'])

print product_data_df


  product_name product_type
0           x0            c
1           x1            d
2           x2            b
3           x3            d
4           x4            d
5           x5            c
6           x6            b
7           x7            a
8           x8            a

In [15]:
df = pd.concat([sale_value_date_df, product_data_df], axis = 1)

print df


   2010-01-01 00:00:00  2010-01-02 00:00:00  2010-01-03 00:00:00  \
0                   72                   62                   27   
1                   80                   56                   28   
2                   12                   70                   73   
3                   70                   12                   96   
4                   84                   85                   62   
5                   31                   21                   65   
6                   35                   68                   94   
7                   78                   15                    6   
8                   32                   66                   11   

   2010-01-04 00:00:00  2010-01-05 00:00:00  2010-01-06 00:00:00  \
0                   47                   97                    8   
1                   69                   63                   25   
2                   31                   48                   19   
3                   21                   90                    8   
4                   38                   32                    6   
5                   89                   57                   98   
6                   98                   63                   12   
7                    5                   46                   30   
8                   96                   12                   87   

   2010-01-07 00:00:00  2010-01-08 00:00:00  2010-01-09 00:00:00 product_name  \
0                   53                   18                   27           x0   
1                   43                   59                   14           x1   
2                   53                   49                   38           x2   
3                    9                   87                   73           x3   
4                   90                   49                   43           x4   
5                    1                   20                   86           x5   
6                   77                    2                   65           x6   
7                   63                    7                   39           x7   
8                   30                   24                   96           x8   

  product_type  
0            c  
1            d  
2            b  
3            d  
4            d  
5            c  
6            b  
7            a  
8            a  

In [17]:
result_rank_b = query_rank_b(df, 'x4', '2010-01-01', '2010-01-05')
print "query result , the rank is %d"%result_rank_b


  product_name product_type  statistic  rank
8           x8            a        217     1
7           x7            a        150     2
0           x0            c        305     1
5           x5            c        263     2
6           x6            b        358     1
2           x2            b        234     2
4           x4            d        301     1
1           x1            d        296     2
3           x3            d        289     3
query result , the rank is 1

In [ ]: