In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import re
import reprlib
from collections import abc

np.random.seed(1445)

In [2]:
# ========================================diff 函数
df = pd.DataFrame({'AAA' : [1,2,3,4,5,6,7,9,9,11,11,11], 'BBB' : [1,0,1,1,0,1,0,1,1,1,1,1], 'CCC' : [3,3,3,3,3,3,3,4,4,4,4,4]})
# df.query('BBB ==0 and AAA in (2,5)')
ss = df.groupby(['CCC'])#.AAA.apply(set).apply(len)
x = df.apply(max)
for i in x:
    print(i)
# isinstance(ss, abc.Iterable)
# ss.sort_values(ascending=False)
# ss_a = ss['AAA']

# print(ss_a.unique())
# source_cols = df.columns
# str_col = [str(x) + '_col' for x in source_cols]
# categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' }
# df[str_col] = df[source_cols].applymap(categories.get)
# df.groupby(['CCC'])['AAA'].mean()
# df.sort_values('CCC')#.reset_index(drop=True)#.groupby(['CCC'])#.diff()
# data = pd.DataFrame(np.random.rand(20,4), columns = ['A','B','C','D'])
# data.query('A > B')


11
1
4

取出有变化的行操作


In [3]:
type(df.groupby(['CCC']).BBB)


Out[3]:
pandas.core.groupby.SeriesGroupBy

In [7]:
df_ll = df.loc[(df.sort_values('AAA').groupby(['CCC']).BBB.diff() !=0),:]
df_ll.reset_index(drop=True)#,df.sort_values('AAA').groupby(['CCC']).BBB.diff() !=0
df


Out[7]:
AAA BBB CCC
0 1 1 3
1 2 0 3
2 3 1 3
3 4 1 3
4 5 0 3
5 6 1 3
6 7 0 3
7 9 1 4
8 9 1 4
9 11 1 4
10 11 1 4
11 11 1 4

In [58]:
df.loc[df.sort_values('AAA').groupby(['CCC']).BBB.diff() !=0, :]


Out[58]:
AAA BBB CCC
0 1 1 3
1 2 0 3
2 3 1 3
4 5 0 3
5 6 1 3
6 7 0 3
7 9 1 4

In [2]:
mean = df.groupby(['CCC', 'BBB'])['AAA'].apply(lambda x: '%.2f' % x).apply(set).apply(list)
mean
# df.loc[:, ['AAA', 'BBB']]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-c5d1f872956a> in <module>()
----> 1 mean = df.groupby(['CCC', 'BBB'])['AAA'].apply(lambda x: '%.2f' % x).apply(set).apply(list)
      2 mean
      3 # df.loc[:, ['AAA', 'BBB']]

NameError: name 'df' is not defined

In [104]:
np.random.seed(1445)
df_np = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'],index=pd.date_range('1/1/2000', periods=10));df_np


Out[104]:
A B C
2000-01-01 -1.060171 -0.036052 -1.492735
2000-01-02 0.433475 -1.367344 0.465158
2000-01-03 -1.690195 -0.838451 0.323225
2000-01-04 -0.397144 3.080562 2.463100
2000-01-05 0.981047 1.074076 0.231936
2000-01-06 1.100334 1.171540 0.245949
2000-01-07 1.693705 -0.379755 -1.581115
2000-01-08 1.836124 0.700274 -1.215588
2000-01-09 1.123545 -1.470417 -2.324812
2000-01-10 2.456664 1.744990 1.139921

In [7]:
df_np.iloc[0:6].agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}).apply(lambda x: x)


Out[7]:
A B
max NaN 3.080562
min -1.690195 -1.367344
sum -0.632654 NaN

In [38]:
df_w = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
# df_w.A = df_w.A.mask(df_w.A == 0, df.B)
df_w


Out[38]:
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9

In [56]:
a = np.arange(10).reshape(-1,2)
b = np.arange(10, 20)
a,b


Out[56]:
(array([[0, 1],
        [2, 3],
        [4, 5],
        [6, 7],
        [8, 9]]), array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19]))

In [8]:
df_w.loc[df_w.A>0, 'B'] = 1;df_w


Out[8]:
A B
0 0 1
1 2 1
2 4 1
3 6 1
4 8 1

In [159]:
df1 = pd.DataFrame(np.random.randn(10,1),columns=['A'],index = np.arange(0,10,1))
df2 = pd.DataFrame(np.random.randn(10,1),columns=['B'],index = np.arange(0,10,1))
df_concat = pd.concat([df1, df2], axis=1);df_concat
df_concat.plot.bar()
plt.show()



In [299]:
import matplotlib.pyplot as plt
from matplotlib.dates import date2num
import datetime

x = [datetime.datetime(2011, 1, 4, 0, 0),
     datetime.datetime(2011, 1, 5, 0, 0),
     datetime.datetime(2011, 1, 6, 0, 0)]
x = date2num(x)
y = [4, 9, 2]
z=[1,2,3]
k=[11,12,13]

ax = plt.subplot(111)
ax.bar(x-0.2, y,width=0.2,align='center')
ax.bar(x, z,width=0.2,align='center')
ax.bar(x+0.2, k,width=0.2,align='center')
ax.xaxis_date()
plt.show()

ax.axis()


Out[299]:
(734140.57000000007, 734143.42999999993, 0.0, 13.65)

In [51]:
df4 = pd.DataFrame({'E': ['E2', 'E3', 'B6', 'B7'],
                    'F': ['F2', 'F3', 'D6', 'D7'],
                    'G': ['G2', 'G3', 'F6', 'F7']},
                   index=[0, 1, 2, 3])
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                   index=[0, 1, 2, 3])
result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])
# result.rename(columns={"A": "a", "B": "b"})
result.loc[:,['F', 'A']] = result[['A', 'F']].values
result


Out[51]:
A B C D E F G
0 F2 B0 C0 D0 E2 A0 G2
1 F3 B1 C1 D1 E3 A1 G3
2 D6 B2 C2 D2 B6 A2 F6
3 D7 B3 C3 D3 B7 A3 F7

In [112]:
# evenly sampled time at 200ms intervals
t = np.arange(0., 5., 0.2)

# red dashes, blue squares and green triangles
plt.plot(t, t, 'r--', t, t**2, 'cs', t, t**3, 'g^')
plt.show()



In [116]:
x = np.linspace(0, 2, 100)

plt.plot(x, x, label='linear')
plt.plot(x, x**2, label='quadratic')
plt.plot(x, x**3, label='cubic')

plt.xlabel('x label')
plt.ylabel('y label')

plt.title("Simple Plot")

plt.legend()

plt.show()



In [157]:
a = pd.DataFrame(np.random.rand(4,5), columns = list('abcde'))
a_a = a.values
# plt.plot(a_a, label='a')
plt.plot(a_a, marker='o')
# plt.legend()
plt.show()
# a_asndarray



In [4]:
pd.Series(['man_q3', 'man_q2', 'man_q1']).str.replace(r'^man_', '')
pd.Series({'a':1 , 'b': 2})


Out[4]:
a    1
b    2
dtype: int64

In [16]:
df_test = pd.DataFrame({'E': ['E2', 'E3', 'B6', 'B7'],
                    'F': ['F2', 'F3', 'D6', 'D7'],
                    'G': ['G2', 'G3', 'F6', 'F7']},
                   index=[0, 1, 2, 3])
df_test.applymap(lambda x, y : x+y , df_test.E, df_test.F)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-16-3c9034aee6c7> in <module>()
      3                     'G': ['G2', 'G3', 'F6', 'F7']},
      4                    index=[0, 1, 2, 3])
----> 5 df_test.applymap(lambda x, y : x+y , df_test.E, df_test.F)

TypeError: applymap() takes 2 positional arguments but 4 were given

In [2]:
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
m = df % 3 == 0
df


Out[2]:
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9

In [7]:
df2 = df.mask(m, -df)
df2


Out[7]:
A B
0 0 1
1 2 -3
2 4 5
3 -6 7
4 8 -9

In [8]:
df.append(df2)


Out[8]:
A B
0 0 1
1 2 3
2 4 5
3 6 7
4 8 9
0 0 1
1 2 -3
2 4 5
3 -6 7
4 8 -9