python中的pandas功能
Python常见的pandas用法demo示例
本文实例总结了Python常见的pandas用法,分享给大家供大家参考,具体如下:
|
import numpy as np
import pandas as pd
|
# Define a Series: a single column of values in which every row carries an
# index label.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)
print(s.index)
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
RangeIndex(start=0, stop=6, step=1)
|
# Six consecutive calendar days starting at 2018-01-01.
dates = pd.date_range('20180101', periods=6)
print(dates)
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
'2018-01-05', '2018-01-06'],
dtype='datetime64[ns]', freq='D')
|
# Define a DataFrame; it can be viewed as a matrix that has an index and
# columns. With no labels given, both default to 0..n-1.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
|
# np.random.randn(6, 4) produces a 6-row, 4-column matrix of random values;
# the rows are labelled with `dates` and the columns with 'a'..'d'.
df2 = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df2)
a b c d
2018-01-01 0.300675 1.769383 1.244406 -1.058294
2018-01-02 0.832666 2.216755 0.178716 -0.156828
2018-01-03 1.314190 -0.866199 0.836150 1.001026
2018-01-04 -1.671724 1.147406 -0.148676 -0.272555
2018-01-05 1.146664 2.022861 -1.833995 -0.627568
2018-01-06 -0.192242 1.517676 0.756707 0.058869
|
# Build a DataFrame column by column. Scalar entries ('A', 'B', 'F') are
# repeated for every row of the index supplied by the Series in column 'C'.
df = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20180101'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df)
print(df.dtypes)
A B C D E F
0 1.0 2018-01-01 1.0 3 test foo
1 1.0 2018-01-01 1.0 3 train foo
2 1.0 2018-01-01 1.0 3 test foo
3 1.0 2018-01-01 1.0 3 train foo
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
|
# Row labels, column labels and the underlying values of df.
print(df.index)
print(df.columns)
print(df.values)
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo']
[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'test' 'foo']
[1.0 Timestamp('2018-01-01 00:00:00') 1.0 3 'train' 'foo']]
|
print(df.describe())  # summary statistics
print(df.T)  # transpose
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
0 1 2 \
A 1 1 1
B 2018-01-01 00:00:00 2018-01-01 00:00:00 2018-01-01 00:00:00
C 1 1 1
D 3 3 3
E test train test
F foo foo foo
3
A 1
B 2018-01-01 00:00:00
C 1
D 3
E train
F foo
|
# Sorting a DataFrame.
# axis=1 sorts by the column labels, i.e. it rearranges the column order
# (here descending: F, E, D, C, B, A); the rows are left as they are.
print(df.sort_index(axis=1, ascending=False))
# Sort the rows by the values found in column 'E'.
print(df.sort_values(by='E'))
F E D C B A
0 foo test 3 1.0 2018-01-01 1.0
1 foo train 3 1.0 2018-01-01 1.0
2 foo test 3 1.0 2018-01-01 1.0
3 foo train 3 1.0 2018-01-01 1.0
A B C D E F
0 1.0 2018-01-01 1.0 3 test foo
2 1.0 2018-01-01 1.0 3 test foo
1 1.0 2018-01-01 1.0 3 train foo
3 1.0 2018-01-01 1.0 3 train foo
|
indexes = pd.date_range('20180101', periods=6)
df3 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes, columns=['A', 'B', 'C', 'D'])
print(df3)
print()
# Select a column: bracket access and attribute access return the same Series.
print(df3['A'])
print()
print(df3.A)
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
2018-01-01 0
2018-01-02 4
2018-01-03 8
2018-01-04 12
2018-01-05 16
2018-01-06 20
Freq: D, Name: A, dtype: int32
2018-01-01 0
2018-01-02 4
2018-01-03 8
2018-01-04 12
2018-01-05 16
2018-01-06 20
Freq: D, Name: A, dtype: int32
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
|
# Select rows by slicing, similar to a SQL LIMIT clause.
print(df3[0:0])  # empty slice: no rows, columns kept
print()
print(df3[0:3])  # positional slice: first three rows
print()
# Label slice on the date index; both end points are included.
print(df3['20180103':'20180105'])
Empty DataFrame
Columns: [A, B, C, D]
Index: []
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
A B C D
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
|
# Returns the requested row as a Series indexed by the column names.
print(df3.loc['20180102'])
A 4
B 5
C 6
D 7
Name: 2018-01-02 00:00:00, dtype: int32
|
# Column filtering on a single row.
print(df3.loc['20180103', ['A', 'C']])
print()
# Sub-DataFrame, similar to "select A, C from df limit ...".
print(df3.loc['20180103':'20180105', ['A', 'C']])
print()
# All rows, columns A and B only.
print(df3.loc[:, ['A', 'B']])
A 8
C 10
Name: 2018-01-03 00:00:00, dtype: int32
A C
2018-01-03 8 10
2018-01-04 12 14
2018-01-05 16 18
A B
2018-01-01 0 1
2018-01-02 4 5
2018-01-03 8 9
2018-01-04 12 13
2018-01-05 16 17
2018-01-06 20 21
|
# Position-based selection with iloc.
print(df3)
print()
print(df3.iloc[1])  # second row as a Series
print()
print(df3.iloc[1, 1])  # single element
print()
print(df3.iloc[:, 1])  # second column
print()
print(df3.iloc[0:3, 1:3])  # rows 0-2, columns 1-2
print()
# Rows need not be contiguous, which a SQL LIMIT cannot express.
print(df3.iloc[[1, 3, 5], [0, 2]])
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A 4
B 5
C 6
D 7
Name: 2018-01-02 00:00:00, dtype: int32
5
2018-01-01 1
2018-01-02 5
2018-01-03 9
2018-01-04 13
2018-01-05 17
2018-01-06 21
Freq: D, Name: B, dtype: int32
B C
2018-01-01 1 2
2018-01-02 5 6
2018-01-03 9 10
A C
2018-01-02 4 6
2018-01-04 12 14
2018-01-06 20 22
|
# print(df3.ix[:3, ['A', 'C']])  # .ix no longer exists in pandas; use .loc / .iloc
print(df3)
print()
# Filter rows by a condition on the values, similar to "where A >= 8".
print(df3[df3.A >= 8])
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A B C D
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
|
indexes1 = pd.date_range('20180101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes1, columns=['A', 'B', 'C', 'D'])
print(df4)
print()

# Assign single elements. Every write goes through one .loc/.iloc call on
# df4 itself: chained forms such as df4.A[1] = ... write to an intermediate
# Series and are not guaranteed to reach df4.
df4.iloc[1, 0] = 1111  # second row, column A
df4.loc['20180103', 'B'] = 2222
df4.iloc[3, 2] = 3333
df4.loc['20180105', 'D'] = 4444
print(df4)
print()

# Assign to a range: set B to -1 wherever A < 10.
df4.loc[df4.A < 10, 'B'] = -1
print(df4)
print()

# Zero out every column of the rows where A < 10.
df4[df4.A < 10] = 0
print(df4)
print()
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A B C D
2018-01-01 0 1 2 3
2018-01-02 1111 5 6 7
2018-01-03 8 2222 10 11
2018-01-04 12 13 3333 15
2018-01-05 16 17 18 4444
2018-01-06 20 21 22 23
A B C D
2018-01-01 0 -1 2 3
2018-01-02 1111 5 6 7
2018-01-03 8 -1 10 11
2018-01-04 12 13 3333 15
2018-01-05 16 17 18 4444
2018-01-06 20 21 22 23
A B C D
2018-01-01 0 0 0 0
2018-01-02 1111 5 6 7
2018-01-03 0 0 0 0
2018-01-04 12 13 3333 15
2018-01-05 16 17 18 4444
2018-01-06 20 21 22 23
|
indexes1 = pd.date_range('20180101', periods=6)
df4 = pd.DataFrame(np.arange(24).reshape(6, 4), index=indexes1, columns=['A', 'B', 'C', 'D'])
print(df4)
print()

# Add a column filled with NaN (np.nan is the spelling that works on every
# NumPy version; the np.NaN alias was removed in NumPy 2.0).
df4['E'] = np.nan
print(df4)
print()

# The Series index is shifted by one day, so assignment aligns on the index:
# rows of df4 that the Series lacks become NaN (hence dtype float64), and
# Series entries whose label is not in df4 are dropped.
df4['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20180102', periods=6))
print(df4)
print()
print(df4.dtypes)
A B C D
2018-01-01 0 1 2 3
2018-01-02 4 5 6 7
2018-01-03 8 9 10 11
2018-01-04 12 13 14 15
2018-01-05 16 17 18 19
2018-01-06 20 21 22 23
A B C D E
2018-01-01 0 1 2 3 NaN
2018-01-02 4 5 6 7 NaN
2018-01-03 8 9 10 11 NaN
2018-01-04 12 13 14 15 NaN
2018-01-05 16 17 18 19 NaN
2018-01-06 20 21 22 23 NaN
A B C D E F
2018-01-01 0 1 2 3 NaN NaN
2018-01-02 4 5 6 7 NaN 1.0
2018-01-03 8 9 10 11 NaN 2.0
2018-01-04 12 13 14 15 NaN 3.0
2018-01-05 16 17 18 19 NaN 4.0
2018-01-06 20 21 22 23 NaN 5.0
A int32
B int32
C int32
D int32
E float64
F float64
dtype: object
|
df_t = pd.DataFrame(np.arange(24).reshape(6, 4), index=[1, 2, 3, 4, 5, 6], columns=['A', 'B', 'C', 'D'])
# Punch two holes into the data (np.nan rather than the np.NaN alias, which
# was removed in NumPy 2.0).
df_t.iloc[0, 1] = np.nan
df_t.iloc[1, 2] = np.nan

df = df_t.copy()
print(df)
print()
# Drop every row that contains at least one NaN.
print(df.dropna(axis=0, how='any'))
print()

df = df_t.copy()
# Drop every column that contains at least one NaN.
print(df.dropna(axis=1, how='any'))
print()

df = df_t.copy()
df['C'] = np.nan
print(df)
print()
# Drop only the columns whose values are all NaN.
print(df.dropna(axis=1, how='all'))
print()
A B C D
1 0 NaN 2.0 3
2 4 5.0 NaN 7
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
A B C D
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
A D
1 0 3
2 4 7
3 8 11
4 12 15
5 16 19
6 20 23
A B C D
1 0 NaN NaN 3
2 4 5.0 NaN 7
3 8 9.0 NaN 11
4 12 13.0 NaN 15
5 16 17.0 NaN 19
6 20 21.0 NaN 23
A B D
1 0 NaN 3
2 4 5.0 7
3 8 9.0 11
4 12 13.0 15
5 16 17.0 19
6 20 21.0 23
|
df = df_t.copy()
print(df)
print()
print(df.isna())  # element-wise NaN mask
print()
# isnull is an alias of isna and behaves the same.
print(df.isnull().any())  # per column: does it contain any NaN?
print()
print(df.isnull().any(axis=1))  # per row: does it contain any NaN?
print()
# The mask is already boolean, so no comparison against True is needed.
print(np.any(df.isna()))
print()
print(df.fillna(value=0))  # replace NaN with a value
A B C D
1 0 NaN 2.0 3
2 4 5.0 NaN 7
3 8 9.0 10.0 11
4 12 13.0 14.0 15
5 16 17.0 18.0 19
6 20 21.0 22.0 23
A B C D
1 False True False False
2 False False True False
3 False False