In [2]:

    
import numpy as np
import pandas as pd

construct a test dataframe

这里构造一个测试数据， T1~T7的列代表某个时间下的产品销售量 product_type列代表产品所属的类型，预设4个类型； product_name代表产品名



In [4]:

    
data_rang = 9
pr_type = ['a', 'b', 'c', 'd']
p_type = [ np.random.choice(pr_type) for i in range(data_rang) ]
data = {'product_name' : ['x0', 'x1', 'x3', 'x2', 'x4', 'x5', 'x6', 'x7', 'x8'],
       'T1': np.random.randint(100, size = [data_rang]),
       'T2': np.random.randint(100, size = [data_rang]),
       'T3': np.random.randint(100, size = [data_rang]),
       'T4': np.random.randint(100, size = [data_rang]),
       'T5': np.random.randint(100, size = [data_rang]),
       'T6': np.random.randint(100, size = [data_rang]),
       'T7': np.random.randint(100, size = [data_rang]),
       'product_type': p_type}

test_data = pd.DataFrame(data, columns = ['product_name', 'T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'product_type'])



In [5]:

    
print test_data









    



  product_name  T1  T2  T3  T4  T5  T6  T7 product_type
0           x0  40   1  64   3  64  85  28            b
1           x1  94  19  75  85  22   5  96            b
2           x3  65  83  10  32  72  73   2            b
3           x2  71  62  85  84  94  11   5            d
4           x4  45  20  85  88  38  25   1            a
5           x5  92  66   9   7  53  91   7            b
6           x6  15  43   2  59   7  52   7            a
7           x7  98  12  66  92   2  46  70            a
8           x8  57   7  75  42  92   6  86            a

main function

dealing_data 把传入的数据根据产品类型做数据截取，截取后的数据为在某个指定时间段内，每个产品类的所有产品在这个时间段内的销售数据然后再统计该时间段内每个类产品的销售总量，并排序



In [6]:

    
def dealing_data(data, start_time, end_time):
    product_ty = set(data['product_type']) 
    result_df = pd.DataFrame()
    
    for item in product_ty:
        tmp_data = data[data['product_type'] == item]
        slice_data = slicing_data(tmp_data, start_time, end_time)
        columns_name = ['product_name', 'product_type']
        
        tmp_data = tmp_data.loc[:, columns_name]
        tmp_data['statistic'] = np.zeros(np.array(tmp_data).shape[0])
        tmp_data['statistic'] = np.sum(slice_data, axis = 1)
        
        tmp_data = tmp_data.sort_values('statistic', ascending = False)
        
        tmp_data['rank'] = range(len(tmp_data))
        tmp_data['rank'] += 1
        
        result_df = result_df.append(tmp_data)
    print result_df
    return result_df



In [7]:

    
def slicing_data(data, start_time, end_time):
    #select_column = [pd.to_datetime(start_time), pd.to_datetime(end_time)]
    #print data
    #print "***********",data[select_column]
    return data.loc[:, start_time : end_time]
    #return data[select_column]

查询函数，通过输入指定产品名和起止时间参数，返回该产品在该类中的销售排名



In [8]:

    
def query_rank(data, query_product_name, start_time, end_time):
    re_data = dealing_data(data, start_time, end_time)
    result = re_data[re_data['product_name'] == query_product_name]['rank'].values
    
    return result[0]



In [9]:

    
#查询 产品名为 x6 在 T1到T4这段时间内在同类产品中的销售排名
result_rank = query_rank(test_data, 'x6',  'T1', 'T3')

print "query result , the rank is %d"%result_rank

# 返回总的销售排名表









    



  product_name product_type  statistic  rank
7           x7            a        176     1
4           x4            a        150     2
8           x8            a        139     3
6           x6            a         60     4
1           x1            b        188     1
5           x5            b        167     2
2           x3            b        158     3
0           x0            b        105     4
3           x2            d        218     1
query result , the rank is 4

another form of datafram



In [10]:

    
def dealing_data_b(data, start_time, end_time):
    product_ty = set(data['product_type']) 
    result_df = pd.DataFrame()
    
    for item in product_ty:
        tmp_data = data[data['product_type'] == item]
        slice_data = slicing_data_b(tmp_data, start_time, end_time)
        columns_name = ['product_name', 'product_type']
        
        tmp_data = tmp_data.loc[:, columns_name]
        tmp_data['statistic'] = np.zeros(np.array(tmp_data).shape[0])
        tmp_data['statistic'] = np.sum(slice_data, axis = 1)
        
        tmp_data = tmp_data.sort_values('statistic', ascending = False)
        
        tmp_data['rank'] = range(len(tmp_data))
        tmp_data['rank'] += 1
        
        result_df = result_df.append(tmp_data)
    print result_df
    return result_df



In [11]:

    
def slicing_data_b(data, start_time, end_time):
    #select_column = [pd.to_datetime(start_time), pd.to_datetime(end_time)]
    select_columns = [ it for it in pd.date_range(start_time, end_time)]
    #print data
    #print "***********",data[select_column]
    #return data.loc[:, start_time : end_time]
    return data[select_columns]



In [12]:

    
def query_rank_b(data, query_product_name, start_time, end_time):
    re_data = dealing_data_b(data, start_time, end_time)
    result = re_data[re_data['product_name'] == query_product_name]['rank'].values
    
    return result[0]



In [13]:

    
# construct dataframe
sale_value_date = { el: np.random.randint(100, size = [data_rang]) for el in pd.date_range('20100101', '20100109')}
sale_value_date_df = pd.DataFrame(sale_value_date)

print sale_value_date_df









    



   2010-01-01  2010-01-02  2010-01-03  2010-01-04  2010-01-05  2010-01-06  \
0          72          62          27          47          97           8   
1          80          56          28          69          63          25   
2          12          70          73          31          48          19   
3          70          12          96          21          90           8   
4          84          85          62          38          32           6   
5          31          21          65          89          57          98   
6          35          68          94          98          63          12   
7          78          15           6           5          46          30   
8          32          66          11          96          12          87   

   2010-01-07  2010-01-08  2010-01-09  
0          53          18          27  
1          43          59          14  
2          53          49          38  
3           9          87          73  
4          90          49          43  
5           1          20          86  
6          77           2          65  
7          63           7          39  
8          30          24          96



In [14]:

    
pr_type = ['a', 'b', 'c', 'd']
p_name = [ "x" + str(i) for i in range(data_rang) ]
p_type = [ np.random.choice(pr_type) for i in range(data_rang) ]

product_data = {'product_name': p_name,
               'product_type': p_type}

product_data_df = pd.DataFrame(product_data, columns = ['product_name', 'product_type'])

print product_data_df









    



  product_name product_type
0           x0            c
1           x1            d
2           x2            b
3           x3            d
4           x4            d
5           x5            c
6           x6            b
7           x7            a
8           x8            a



In [15]:

    
df = pd.concat([sale_value_date_df, product_data_df], axis = 1)

print df









    



   2010-01-01 00:00:00  2010-01-02 00:00:00  2010-01-03 00:00:00  \
0                   72                   62                   27   
1                   80                   56                   28   
2                   12                   70                   73   
3                   70                   12                   96   
4                   84                   85                   62   
5                   31                   21                   65   
6                   35                   68                   94   
7                   78                   15                    6   
8                   32                   66                   11   

   2010-01-04 00:00:00  2010-01-05 00:00:00  2010-01-06 00:00:00  \
0                   47                   97                    8   
1                   69                   63                   25   
2                   31                   48                   19   
3                   21                   90                    8   
4                   38                   32                    6   
5                   89                   57                   98   
6                   98                   63                   12   
7                    5                   46                   30   
8                   96                   12                   87   

   2010-01-07 00:00:00  2010-01-08 00:00:00  2010-01-09 00:00:00 product_name  \
0                   53                   18                   27           x0   
1                   43                   59                   14           x1   
2                   53                   49                   38           x2   
3                    9                   87                   73           x3   
4                   90                   49                   43           x4   
5                    1                   20                   86           x5   
6                   77                    2                   65           x6   
7                   63                    7                   39           x7   
8                   30                   24                   96           x8   

  product_type  
0            c  
1            d  
2            b  
3            d  
4            d  
5            c  
6            b  
7            a  
8            a



In [17]:

    
result_rank_b = query_rank_b(df, 'x4', '2010-01-01', '2010-01-05')
print "query result , the rank is %d"%result_rank_b









    



  product_name product_type  statistic  rank
8           x8            a        217     1
7           x7            a        150     2
0           x0            c        305     1
5           x5            c        263     2
6           x6            b        358     1
2           x2            b        234     2
4           x4            d        301     1
1           x1            d        296     2
3           x3            d        289     3
query result , the rank is 1



In [ ]:

construct a test dataframe

main function

查询函数， 通过输入指定产品名和起止时间参数， 返回该产品在该类中的销售排名

another form of datafram

查询函数，通过输入指定产品名和起止时间参数，返回该产品在该类中的销售排名