notebook.community

Edit and run



In [90]:

    
import pandas as pd
import numpy as np
# Small demo frame: three identical "Bool*" columns and two identical "att*"
# columns, indexed by the labels 1..5 under the index name "index_new".
bool_values = [1, 2, 3, 3, 4]
attr_values = [22, 33, 22, 44, 66]
data = {
    'BoolCol': list(bool_values),  # list(...) gives every key its own list object
    'attr': list(attr_values),
    'BoolC': list(bool_values),
    'att': list(attr_values),
    'Bool': list(bool_values),
}
index = pd.Index(data=[1, 2, 3, 4, 5], name="index_new")
df = pd.DataFrame(data, index=index)

#df.index, df.index.name



In [91]:

    
import random

# Build a synthetic test table with one row per calendar day starting in 2000.
N_ROWS = 3000  # number of rows; every column below is generated with this length
SEED = 666     # fixed seed so a fresh kernel reproduces the same table
rng = random.Random(SEED)  # private generator: leaves the global `random` state untouched

# Each test number is the fixed prefix "123456" followed by 6 distinct digits:
# rng.sample(range(0, 10), 6) draws 6 of the digits 0-9 without replacement.
test_list = [
    "123456" + "".join(str(digit) for digit in rng.sample(range(0, 10), 6))
    for _ in range(N_ROWS)
]
# Random floats between 1 and 200, rounded to two decimal places.
test_list2 = [round(rng.uniform(1, 200), 2) for _ in range(N_ROWS)]
data = {
    'date': pd.date_range("2000", freq='D', periods=N_ROWS),
    'aa': test_list,
    'test2': test_list2,
    'label': [rng.randint(0, 1) for _ in range(N_ROWS)],  # random 0/1 label
}
df_test = pd.DataFrame(data)

#date_1= pd.date_range("2000",freq= 'D',periods=3000).year



In [92]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for df_test.
df_test.describe()









    Out[92]:







  
    
      
      test2
      label
    
  
  
    
      count
      3000.000000
      3000.000000
    
    
      mean
      100.821863
      0.512000
    
    
      std
      56.557216
      0.499939
    
    
      min
      1.140000
      0.000000
    
    
      25%
      52.022500
      0.000000
    
    
      50%
      101.300000
      1.000000
    
    
      75%
      150.087500
      1.000000
    
    
      max
      199.900000
      1.000000



In [93]:

    
# Move the 'date' column to position 0: pop() removes it from df_test in place
# and returns it, insert() then puts it back as the first column.
date = df_test.pop('date')
df_test.insert(loc=0, column='date', value=date)
# Show the first five rows to check the column order.
df_test.head(n=5)









    Out[93]:







  
    
      
      date
      aa
      test2
      label
    
  
  
    
      0
      2000-01-01
      123456253640
      51.40
      1
    
    
      1
      2000-01-02
      123456652438
      65.38
      1
    
    
      2
      2000-01-03
      123456549183
      35.39
      0
    
    
      3
      2000-01-04
      123456792183
      165.18
      1
    
    
      4
      2000-01-05
      123456785963
      36.03
      0



In [94]:

    
# First two rows, selected by explicit integer position.
df_test.iloc[:2]









    Out[94]:







  
    
      
      date
      aa
      test2
      label
    
  
  
    
      0
      2000-01-01
      123456253640
      51.40
      1
    
    
      1
      2000-01-02
      123456652438
      65.38
      1



In [95]:

    
import numpy as np 
import pandas as pd 
from pandas import Series, DataFrame 
# Seed numpy so the random 5x5 demo frame is the same on every run.
np.random.seed(666)
row_labels = ['A', 'B', 'D', 'E', 'F']
col_labels = ['c1', 'c2', 'c3', 'c4', 'c5']
values = np.random.rand(25).reshape(5, 5)
df = pd.DataFrame(values, index=row_labels, columns=col_labels)

# Quick inspection, left disabled:
#   print(df.shape)          # (5, 5)
#   df.head()                # first five rows
#   df.tail()                # last five rows
#   print(df[['c1', 'c4']])  # access several columns
#            c1        c4
#   A  0.700437  0.727858
#   B  0.012703  0.099929
#   D  0.200248  0.700845
#   E  0.774479  0.110954
#   F  0.023236  0.197503

# Assign a column subset to a new DataFrame.
kept_columns = ['c1', 'c3', 'c5']
sub_df = df[kept_columns]
#            c1        c3        c5
#   A  0.700437  0.676514  0.951458
#   B  0.012703  0.048813  0.508066
#   D  0.200248  0.192892  0.293228
#   E  0.774479  0.112858  0.247668
#   F  0.023236  0.340035  0.909180

# View the first five rows (disabled); the output is the same table as above:
#   print(sub_df.head(5))

# View rows from the middle of the frame with iloc ("index location"):
# selection is by integer position, start inclusive and stop exclusive.
print(sub_df.iloc[1:3])
#            c1        c3        c5
#   B  0.012703  0.048813  0.508066
#   D  0.200248  0.192892  0.293228

# Filter columns as well. The slice syntax is the same as plain Python
# slicing, and it is based on positional index information.
print(sub_df.iloc[1:2, :2])
#            c1        c3
#   B  0.012703  0.048813

# loc filters by label name; label slices include both endpoints.
print(sub_df.loc['A':'B', 'c1':'c5'])









    



         c1        c3        c5
B  0.012703  0.048813  0.508066
D  0.200248  0.192892  0.293228
         c1        c3
B  0.012703  0.048813
         c1        c3        c5
A  0.700437  0.676514  0.951458
B  0.012703  0.048813  0.508066



In [96]:

    
# Select whole rows by integer position: positions 1, 2 and 3.
df.iloc[[1,2,3]]



In [97]:

    
# Select whole rows by index label: "A", "B" and "D".
df.loc[["A","B","D"]]



In [98]:

    
# Three ways to read a single cell, printed one value per line.
print(
    df.loc["A", "c1"],  # label-based lookup through the general loc indexer
    df.at["A", "c1"],   # label-based scalar accessor
    df.iat[1, 1],       # position-based scalar accessor (row 1, column 1)
    sep="\n",
)









    



0.7004371218578347
0.7004371218578347
0.41358769878652346



In [ ]:

	test2	label
count	3000.000000	3000.000000
mean	100.821863	0.512000
std	56.557216	0.499939
min	1.140000	0.000000
25%	52.022500	0.000000
50%	101.300000	1.000000
75%	150.087500	1.000000
max	199.900000	1.000000

	date	aa	test2	label
0	2000-01-01	123456253640	51.40	1
1	2000-01-02	123456652438	65.38	1
2	2000-01-03	123456549183	35.39	0
3	2000-01-04	123456792183	165.18	1
4	2000-01-05	123456785963	36.03	0

	c1	c2	c3	c4	c5
B	0.012703	0.413588	0.048813	0.099929	0.508066
D	0.200248	0.744154	0.192892	0.700845	0.293228
E	0.774479	0.005109	0.112858	0.110954	0.247668

	c1	c2	c3	c4	c5
A	0.700437	0.844187	0.676514	0.727858	0.951458
B	0.012703	0.413588	0.048813	0.099929	0.508066
D	0.200248	0.744154	0.192892	0.700845	0.293228