引入
from pandas import Series,DataFrame
import pandas as pd
import numpy as np
基本数据结构
Series
列表构建
s1 = Series([4,7,-5,3])
获取全部值
print(s1.values)
[ 4 7 -5 3]
获取全部索引
print(s1.index)
RangeIndex(start=0, stop=4, step=1)
自定义索引
s2 = Series([7,7,-5,3],index=["d","b","a","c"])
d 7 b 7 a -5 c 3 dtype: int64
获取单个值
print(s2["d"])
7
获取一组值
print(s2[["c","d","a"]])
c 3 d 7 a -5 dtype: int64
字典构建Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
sdata = Series(sdata)
以key为索引,以value为值,按索引有序排列
索引可以通过赋值方式修改
sdata.index = ["one","two","three","four"]
print(sdata.index)
Index(['one', 'two', 'three', 'four'], dtype='object')
修改为以列表为索引
# 列表作为索引
states = ["California","abc","cba","def"]
sdata = Series(sdata,states)
print(sdata)
California NaN abc NaN cba NaN def NaN dtype: float64
在字典中没有对应key的value将会变为NaN空值
DataFrame
一种表格型数据结构,每一列可以是不同数据类型。
DataFrame既有行索引、也有列索引
DataFrame中的数据是以一个或多个二维块(有行有列)存放的,而不是列表、字典或别的一维数据结构
常用属性
- df.shape
- df.dtypes
- df.index
- df.columns
查看DataFrame所有属性及方法
print(dir(DataFrame))
可用于构建DataFrame的类型
- 二维ndarray
- 由数组、列表或元组组成的字典:每个序列会变成DataFrame的一列。所有序列的长度必须相同
- NumPy的结构化/记录数组
- 由Series组成的字典
- 字典组成的字典
- ……
字典构建DataFrame
由等长列表或Numpy数组组成的字典
data = {'state': ['Ohio',' Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year':[ 2000, 2001,2002,2001, 2002],
'pop': [1.5,1.7, 3.6,2.4,2.9]}
data = DataFrame(data)
print(data)
state year pop 0 Ohio 2000 1.5 1 Ohio 2001 1.7 2 Ohio 2002 3.6 3 Nevada 2001 2.4 4 Nevada 2002 2.9
查看基本信息
print(data.info())
结果:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 state 5 non-null object
1 year 5 non-null int64
2 pop 5 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 248.0+ bytes
None
Process finished with exit code 0
读取一列数据的两种方式
print(data["state"])
print(data.state)
结果:
0 Ohio 1 Ohio 2 Ohio 3 Nevada 4 Nevada Name: state, dtype: object
每一列都是一个Series
列求和
print(data["pop"].sum())
结果:
12.100000000000001
筛选满足条件的数据
print(data.loc[data.state == "Ohio" ])
结果:
state year pop debt
0 Ohio 2000 1.5 16.5
2 Ohio 2002 3.6 16.5
计算最大值、最小值、平均值、标准差、方差
print(data.debt.mean()) # 平均值
print(data.debt.max()) # 最大值
print(data.debt.min()) # 最小值
print(data.debt.std()) # 标准差
print(data.debt.var()) # 方差
结果:
16.5
16.5
16.5
0.0
0.0
调整DataFrame列顺序(注意:下面直接给 data.columns 赋值只是重命名列标签,并不会移动列中的数据;真正调整列顺序应按新顺序选取列,如 data = data[["year","state","pop","debt"]])
data.columns = ["year","state","pop","debt"]
print(data)
结果:
year state pop debt
0 Ohio 2000 1.5 16.5 1 Ohio 2001 1.7 16.5 2 Ohio 2002 3.6 16.5 3 Nevada 2001 2.4 16.5 4 Nevada 2002 2.9 16.5
通过赋值修改一列值
## 列可以通过赋值方式修改
import numpy as np
data["debt"] = 16.5
print(data)
data['debt'] = np.arange(5.) # 0-5 左闭右开
print(data)
结果:
year state pop debt
0 Ohio 2000 1.5 16.5
1 Ohio 2001 1.7 16.5
2 Ohio 2002 3.6 16.5
3 Nevada 2001 2.4 16.5
4 Nevada 2002 2.9 16.5
year state pop debt
0 Ohio 2000 1.5 0.0
1 Ohio 2001 1.7 1.0
2 Ohio 2002 3.6 2.0
3 Nevada 2001 2.4 3.0
4 Nevada 2002 2.9 4.0
Process finished with exit code 0
将列表或数组赋给某列长度必须与DataFrame匹配,
如果赋值的是一个Series,就会精确匹配索引、如果不匹配,会在所有的空位填上缺失值
## 赋值Series
# 自定义行索引
data = {'state': ['Ohio',' Ohio', 'Ohio', 'Nevada', 'Nevada'],
'year':[ 2000, 2001,2002,2001, 2002],
'pop': [1.5,1.7, 3.6,2.4,2.9]}
data = DataFrame(data,index=["one","two","three","four","five"])
print(data)
val = Series([-1.2,-1.5,-1.7],index=["two","four","five"])
data["debt"] = val # 传入Series
print(data)
结果:
state year pop
one Ohio 2000 1.5
two Ohio 2001 1.7
three Ohio 2002 3.6
four Nevada 2001 2.4
five Nevada 2002 2.9
state year pop debt
one Ohio 2000 1.5 NaN
two Ohio 2001 1.7 -1.2
three Ohio 2002 3.6 NaN
four Nevada 2001 2.4 -1.5
five Nevada 2002 2.9 -1.7
Process finished with exit code 0
为不存在的列赋值会产生新列
删除一列
用del可以删除一列
## 删除一列
print(data)
del data["pop"]
print(data)
结果:
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
state year
0 Ohio 2000
1 Ohio 2001
2 Ohio 2002
3 Nevada 2001
4 Nevada 2002
列之间通过逻辑运算产生新列
常用的逻辑运算符
- >、 <
- >=、<=
- !=、==
- and、or、not(注意:对Series/DataFrame做元素级逻辑运算时不能使用 and、or、not,应使用 &、|、~)
字典嵌套构建DataFrame
dic = {
"Nevada":{2001:2.4,2002:2.9},
"0hio":{2000:1.5,2001:1.7,2002:3.6}
}
dic = DataFrame(dic)
print(dic)
结果:
Nevada 0hio
2001 2.4 1.7
2002 2.9 3.6
2000 NaN 1.5
转置
行索引变成列索引
print(dic.T)
行索引与列索引命名
## 行索引与列索引命名
dic.index.name = "year"
dic.columns.name = "state"
print(dic)
结果:
state Nevada 0hio
year
2001 2.4 1.7
2002 2.9 3.6
2000 NaN 1.5
获取所有值
不包括行索引和列索引
## 获取所有值
print(dic.values)
结果:
[[2.4 1.7]
[2.9 3.6]
[nan 1.5]]
返回的是一个二维的ndarray
DataFrame.transpose()
DataFrame转置
Pandas进阶
Series与DataFrame中的索引
Series索引的选取和过滤
Series的索引有两种表示方式
- 数字索引
- 自定义索引
自定义索引可能不存在,但数字索引一定存在
## Series的索引
obj = Series(range(3),index=list("abc"))
print(obj)
结果:
a 0
b 1
c 2
dtype: int64
数字索引及自定义索引
## 选取索引
print(obj)
print(obj[1]) # 数字索引,从0开始,左闭右开
print(obj['b']) # 自定义索引
结果:
a 0
b 1
c 2
dtype: int64
1
1
切片操作
数字索引切片
print(obj.index[1:])
结果:
Index(['b', 'c'], dtype='object')
Series的索引不可被修改
obj.index[1] = "d"
自定义索引切片
# 自定义索引
print(obj['a':'c'])
结果:
a 0
b 1
c 2
dtype: int64
a 0
b 1
c 2
dtype: int64
通过切片修改数据
# 通过切片修改数据
obj['a':'c']=10
print(obj)
结果:
a 0
b 1
c 2
dtype: int64
a 10
b 10
c 10
dtype: int64
Process finished with exit code 0
条件筛选
## 条件筛选
print(obj)
print(obj[obj<7])
结果:
a 0
b 1
c 2
dtype: int64
a 0
b 1
c 2
dtype: int64
索引的删除
删除行
删除索引,即删除该索引所对应的行数据
# Drop rows by index label; drop() returns a new object, the original is unchanged
data = DataFrame(np.arange(16).reshape(4,4), # 0-15 array reshaped into a 4x4 matrix
index=['Ohio', 'Colorado', 'Utah', 'New York'],# row index (original comment had this mislabeled as column index)
columns=['one','two','three','four']) # column labels (original comment had this mislabeled as row index)
print(data)
# axis=0 drops the two named rows; `data` itself is left unmodified
result = data.drop(["Ohio",'Colorado'],axis=0)
print(result)
print(data)
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one two three four
Utah 8 9 10 11
New York 12 13 14 15
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
Process finished with exit code 0
删除列
## axis = 1 删除两列,原数据data不变
result = data.drop(["three",'four'],axis=1)
print(result)
print(data)
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one two
Ohio 0 1
Colorado 4 5
Utah 8 9
New York 12 13
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
Process finished with exit code 0
inplace参数可以直接修改原数据
## inplace直接修改原数据
data.drop(["three",'four'],axis=1,inplace=True)
print(data)
结果:
one two
Ohio 0 1
Colorado 4 5
Utah 8 9
New York 12 13
Process finished with exit code 0
删除并返回一列
## 删除并返回一列
dd = data.pop("three")
print(dd)
print(data)
结果:
Ohio 2
Colorado 6
Utah 10
New York 14
Name: three, dtype: int32
one two four
Ohio 0 1 3
Colorado 4 5 7
Utah 8 9 11
New York 12 13 15
Process finished with exit code 0
DataFrame索引的选取和过滤
索引选取
选取一列
print(data["one"]) # 第一种
print(data.one) # 第二种
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
Ohio 0
Colorado 4
Utah 8
New York 12
Name: one, dtype: int32
返回结果类型
print(type(data["one"])) #一个中括号返回Series
print(type(data[["one"]])) #两个中括号返回DataFrame
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
返回多列
## 返回多列
print(data[["one","two"]])
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one two
Ohio 0 1
Colorado 4 5
Utah 8 9
New York 12 13
Process finished with exit code 0
获取一行
使用DataFrame.loc()
格式:
loc[["行索引"],["列索引"]]
## 获取行
print(data.loc["New York"])
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one 12
two 13
three 14
four 15
Name: New York, dtype: int32
Process finished with exit code 0
获取多行
## 获取多行必须两个中括号,才能获取到DataFrame,否则是Series
print(type(data.loc["New York"]))
print(data.loc[["New York","New York"]])
print(type(data.loc[["New York","New York"]]))
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
<class 'pandas.core.series.Series'>
one two three four
New York 12 13 14 15
New York 12 13 14 15
<class 'pandas.core.frame.DataFrame'>
获取指定行和列的元素
数字索引
使用Data.iloc[行索引,列索引]
支持切片操作
print(data.iloc[2:4,2:4])
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
three four
Utah 10 11
New York 14 15
three four
Utah 10 11
New York 14 15
自定义索引
## 获取指定行指定列
print(data.loc[["New York","Utah"],["three","four"]])
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
three four
New York 14 15
Utah 10 11
可以直接赋值
## 选取并修改
data.iloc[2:4,2:4] = 100
print(data)
结果:
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 100 100
New York 12 13 100 100
pandas中的数据运算与算术对齐
Series对齐
索引不完全相同的Series对象之间可以进行运算:相同索引的元素会对齐后参与运算,只在其中一方出现的索引对应的结果为NaN
## Pandas数据运算与算术对齐
s1 = Series([7.3, -2.5, 3.4, 1.5],index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6,-1.5,4,3.1],index=['a', 'c', 'e', 'f', 'g'])
print(s1+s2)
结果:
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
DataFrame对齐
DataFrame中的对齐会发生在相同行索引和列索引的元素上
## DataFrame
df1 = DataFrame(np. arange(9.). reshape((3, 3)),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
print(df1)
df2 = DataFrame(np. arange(12.).reshape((4, 3)),columns=list('bde'),index=['Utah', 'Ohio','Texas','Oregon'])
print(df2)
print(df1+df2)
结果:
b c d
Ohio 0.0 1.0 2.0
Texas 3.0 4.0 5.0
Colorado 6.0 7.0 8.0
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN
这种对齐默认其他为NaN
但如果调用DataFrame.add()方法并传入fill_value=0,会先按两个对象行、列索引的并集对齐,缺失位置填充0,再相加
df1 = DataFrame(np.arange(12).reshape(3,4),columns=list("abcd"))
print(df1)
df2 = DataFrame(np.arange(20).reshape(4,5),columns=list("abcde"))
print(df2)
print(df1.add(df2,fill_value=0))
结果:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 11.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
或者先相加,再填充NaN为0
print(df1.add(df2).fillna(0))
结果:
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
a b c d e
0 0.0 2.0 4.0 6.0 0.0
1 9.0 11.0 13.0 15.0 0.0
2 18.0 20.0 22.0 24.0 0.0
3 0.0 0.0 0.0 0.0 0.0
*DataFrame与Series之间的运算
DataFrame与Series运算,会将Series中的索引匹配到DataFrame中的列进行运算,然后运算广播到每一行
## DataFrame与Series的运算
frame = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
series = frame.iloc[0]
print(frame - series)
结果:
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
*函数应用与映射
元素级数组方法
NumPy的ufuncs (元素级数组方法)也可用于操作pandas对象
## 函数应用和映射
frame = DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
print(np.abs(frame))
结果:
b d e
Utah 0.886268 0.018775 1.143133
Ohio 1.037654 2.133544 0.845259
Texas 0.233974 0.705851 0.606051
Oregon 2.114198 0.230378 0.870002
列表级函数映射
用DataFrame.apply()将函数应用到每一行或每一列组成的一维数组上(列表级函数映射)
## 函数映射
f = lambda x:x.max() - x.min()
print(frame.apply(f)) # 默认每一列进行操作
print(frame.apply(f,axis=1)) # axis=1表示对行进行操作
结果:
b d e
Utah 1.108797 1.406696 1.636189
Ohio 0.046974 1.236916 2.155694
Texas 1.501698 1.416079 0.195249
Oregon 0.496316 0.288684 0.364140
b 2.610494
d 2.822775
e 3.791883
dtype: float64
Utah 2.744986
Ohio 3.392610
Texas 2.917777
Oregon 0.860456
dtype: float64
元素级函数映射
applymap()元素级函数映射
## 元素级
formate = lambda x:'%.2f' %x
print(frame.applymap(formate))
结果:
b d e
Utah 1.988918 0.882728 0.429667
Ohio 0.483167 0.307487 1.199268
Texas 0.111927 0.667282 0.095245
Oregon 0.884288 0.192735 0.084042
b d e
Utah 1.99 -0.88 0.43
Ohio 0.48 -0.31 -1.20
Texas -0.11 0.67 0.10
Oregon -0.88 0.19 -0.08
Series函数映射map()方法
## Series的元素级函数映射方法
print(frame["e"].map(formate))
结果:
b d e
Utah 0.372528 0.474377 1.868852
Ohio 0.027990 0.775611 0.573100
Texas 0.528759 0.773985 0.098583
Oregon 0.419916 1.513764 0.223223
Utah 1.87
Ohio 0.57
Texas -0.10
Oregon -0.22
Name: e, dtype: object
*排序与排名
对行或列按字典序进行排列,并返回已排序的新对象
import random
obj = Series([random.randint(0,100) for item in range(4)],index=['d','a','b','c'])
print(obj.sort_index())
结果:
a 47
b 4
c 15
d 3
dtype: int64
按值降序排列
print(obj.sort_values(ascending=False)) # 按值降序排列
结果:
a 30
b 59
c 31
d 89
dtype: int64
d 89
b 59
c 31
a 30
dtype: int64
列排序与降序排序的参数选项
DataFrame中的排序,按列排序axis=1,降序ascending
## DataFrame排序
frame = DataFrame(np.arange(8).reshape((2,4)), index=['three','one'],columns=['d','a','b','c'])
print(frame.sort_index()) # 将行索引按字典序排列 axis=0
print(frame.sort_index(axis=1)) # 将列索引排序
结果:
d a b c
one 4 5 6 7
three 0 1 2 3
a b c d
three 1 2 3 0
one 5 6 7 4
一个或多个列中的值排序
使用by选项
## 一列或多列值排序
print(frame)
print(frame.sort_values(by='b')) # 按b排序
# 先按b排,一样的情况下按d排
print(frame.sort_values(by=['b','d'])) # 多列排序
结果:
b d e
Utah 0.964891 -0.502307 0.594309
Ohio 0.218862 1.232973 0.440679
Texas -0.572743 0.438073 0.842702
Oregon 1.159632 0.017506 0.493435
b d e
Texas -0.572743 0.438073 0.842702
Ohio 0.218862 1.232973 0.440679
Utah 0.964891 -0.502307 0.594309
Oregon 1.159632 0.017506 0.493435
b d e
Texas -0.572743 0.438073 0.842702
Ohio 0.218862 1.232973 0.440679
Utah 0.964891 -0.502307 0.594309
Oregon 1.159632 0.017506 0.493435
带有重复值的轴索引
快速判断索引是不是唯一的
## 带有重复值的轴索引
obj = Series(range(5),index=list('aabbc'))
print(obj)
print(obj.index.is_unique)
print(obj['a'])
print(obj['c'])
结果:
a 0
a 1
b 2
b 3
c 4
dtype: int64
False
a 0
a 1
dtype: int64
4
*汇总计算描述统计
用于Series以及DataFrame中求和
## 汇总求和
print(frame)
print(frame.sum()) # 列运算
print(frame.sum(axis=1)) # axis=1行运算
结果:
b d e
Utah -0.486037 -1.770174 1.490661
Ohio 0.061493 -0.648828 -0.590367
Texas -0.678654 -0.354049 0.782610
Oregon -0.049362 -0.862766 0.476923
b -1.152560
d -3.635817
e 2.159828
dtype: float64
Utah -0.765550
Ohio -1.177702
Texas -0.250093
Oregon -0.435204
dtype: float64
包含NaN运算
默认忽略nan值,也可以包含进来,参与运算
# 包含NaN值运算
frame.loc[["Texas"],['d']] = np.nan
print(frame)
print(frame.sum(skipna=False))
print(frame.sum(axis=1,skipna=False))
结果:
b d e
Utah 1.517026 0.217888 -1.107845
Ohio 0.564460 0.408671 -0.870724
Texas 0.222509 NaN -0.036652
Oregon 0.661925 -0.464794 0.748262
b 2.965920
d NaN
e -1.266958
dtype: float64
Utah 0.627069
Ohio 0.102408
Texas NaN
Oregon 0.945393
dtype: float64
累计求和
就是当前元素将前面所有元素的值累加。
一个简单的例子:
序号 | 元素 |
---|---|
0 | 0 |
1 | 1 |
2 | 2 |
3 | 3 |
4 | 4 |
5 | 5 |
6 | 6 |
7 | 7 |
将所有元素累加后
序号 | 累加后元素 |
---|---|
0 | 0+0=0 |
1 | 0+1=1 |
2 | 2+1=3 |
3 | 3+3=6 |
4 | 4+6=10 |
5 | 5+10=15 |
6 | 6+15=21 |
7 | 7+21=28 |
## 累计求和
frame = DataFrame(np.arange(8),index=list('abcdefgh'))
print(frame)
print(frame.cumsum())
结果:
0
a 0
b 1
c 2
d 3
e 4
f 5
g 6
h 7
0
a 0
b 1
c 3
d 6
e 10
f 15
g 21
h 28
非数值型也可以累加
## 非数值型累加
frame = DataFrame(["ABC","CBA","ABCBA"],index=["one","two","three"])
print(frame)
print(frame.cumsum())
结果:
0
one ABC
two CBA
three ABCBA
0
one ABC
two ABCCBA
three ABCCBAABCBA
数值型统计量
调用DataFrame.describe()方法
查看数值型列常见描述统计量:最大值、最小值、平均值、标准差、方差、分位数……
print(frame.describe())
结果:
count 8.00000
mean 3.50000
std 2.44949
min 0.00000
25% 1.75000
50% 3.50000
75% 5.25000
max 7.00000
非数值型统计量
会用来词频统计分析
## 非数值型
s = Series(list("aabc")*4)
print(s)
print(s.describe())
结果:
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
count 16
unique 3
top a
freq 8
dtype: object
唯一值、值计数与成员资格
去重查看唯一值
这里的唯一值实质去掉重复元素后所有的值,而不是只出现过一次的值
## 查看Series唯一值
print(s)
print(s.unique())
结果:
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
['a' 'b' 'c']
词频统计(值计数)
## 词频统计
print(s.value_counts(sort=False)) # 不排序
结果:
a 8
b 4
c 4
dtype: int64
使用Pandas的API进行词频统计
print(pd.value_counts(s).sort_values(ascending=False))
结果:
a 8
b 4
c 4
dtype: int64
a 8
b 4
c 4
dtype: int64
apply与value_counts结合,查看所有唯一元素的词频
## apply+value_counts结合查看每一列所有元素词频,空值补0
print(s.apply(pd.value_counts).fillna(0))
结果:
a b c
0 1.0 0.0 0.0
1 1.0 0.0 0.0
2 0.0 1.0 0.0
3 0.0 0.0 1.0
4 1.0 0.0 0.0
5 1.0 0.0 0.0
6 0.0 1.0 0.0
7 0.0 0.0 1.0
8 1.0 0.0 0.0
9 1.0 0.0 0.0
10 0.0 1.0 0.0
11 0.0 0.0 1.0
12 1.0 0.0 0.0
13 1.0 0.0 0.0
14 0.0 1.0 0.0
15 0.0 0.0 1.0
成员资格
成员资格,指的就是判断Series或DataFrame中的每个元素是否在指定的集合中,包含返回True,不包含返回False
## 成员资格
print(s.isin(['b','c']))
结果:
0 False
1 False
2 True
3 True
4 False
5 False
6 True
7 True
8 False
9 False
10 True
11 True
12 False
13 False
14 True
15 True
dtype: bool
*缺失值处理
判断空值(非空值)
用来判断Series或DataFrame中的各个值是否为NaN,如果是返回True,不是返回False
## 缺失值处理
## 判断缺失值
print(s.isnull()) # 判断每个元素是否为空
print(s.notnull()) # 判断每个元素是否不为空
print(s.isnull().value_counts()) # 统计空值个数
print(s.notnull().value_counts()) # 统计非空值个数
结果:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
dtype: bool
0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
dtype: bool
False 16
dtype: int64
True 16
dtype: int64
Process finished with exit code 0
获取空值(非空值)
## 获取空值
print(s[s.isnull()])
## 获取非空值
print(s[s.notnull()])
结果:
Series([], dtype: object)
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
Process finished with exit code 0
缺失值处理
丢弃删除缺失值
Series
删除缺失值NaN后,返回非空数据和索引值的Series
## 缺失值处理
## 丢弃删除缺失值
from numpy import nan as NA
s = Series([1,NA,3.5,NA,7])
print(s)
print(s.dropna())
结果:
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
dtype: float64
0 1.0
2 3.5
4 7.0
也可以通过下面的方式达到同样效果
print(s[s.notnull()]) # 相当于dropna
结果:
dtype: float64
0 1.0
2 3.5
4 7.0
dtype: float64
DataFrame
丢弃删除包含NaN的所有行,返回新的DataFrame
## DataFrame
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
print(data.dropna())
结果:
0 1 2
0 1.0 6.5 3.0
传入how选项,将只会删除所有列都为NaN的行
print(data.dropna(how="all")) # 一行全为NaN的时候才会删除
结果:
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
删除全为NaN的列
使用axis选项对列进行操作,一列全为NaN时删除一列
## 删除全为NaN的一列
data = DataFrame([[1.,6.5,NA],[1.,NA,NA],[NA,NA,NA],[NA,6.5,NA]])
print(data)
print(data.dropna(axis=1,how='all'))
结果:
0 1 2
0 1.0 6.5 NaN
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 NaN
0 1
0 1.0 6.5
1 1.0 NaN
2 NaN NaN
3 NaN 6.5
*填充缺失数据
一般填充
print(data.fillna(0))
结果:
0 1 2
0 1.0 6.5 NaN
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 NaN
0 1 2
0 1.0 6.5 0.0
1 1.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 6.5 0.0
指定列的NaN填充值
执行fillna()后,会生成当前DataFrame的副本对象,对副本进行填充,再将副本对象返回给变量
print(data.fillna({1:100,2:200}))
结果:
0 1 2
0 1.0 6.5 NaN
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 NaN
0 1 2
0 1.0 6.5 200.0
1 1.0 100.0 200.0
2 NaN 100.0 200.0
3 NaN 6.5 200.0
使用inplace选项可以直接修改原数据,不会产生新对象
data.fillna(300,inplace=True)# 直接修改原数据
print(data)
结果:
0 1 2
0 1.0 6.5 NaN
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 NaN
0 1 2
0 1.0 6.5 300.0
1 1.0 300.0 300.0
2 300.0 300.0 300.0
3 300.0 6.5 300.0
Pandas中的字符串操作
字符串对象常用方法
- split()
- strip()
- join()
- in
print('a' in piece)
结果:True
- index()
print('piece'.index('e'))
结果:2
- find()
print('piece'.find('e'))
结果:2
- count()
- replace()
序列解包
## 字符串操作
## 序列解包
piece = ['a','b','c','d']
first,second,third,fourth = piece
print(first,second,third,fourth)
结果:
a b c d
Pandas中的日期时间处理
Pandas数据加载、存储和解析
加载csv
## 数据加载、存储与解析
df = pd.read_csv('./Datas/ex1.csv')
print(df)
结果:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
或者使用另一种方法,需要指定分隔符
df = pd.read_table('./Datas/ex1.csv',sep=',')
print(df)
标题加载选项
header设置是否加载标题
print(pd.read_csv('./Datas/ex2.csv',header=None))
结果:
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
指定列索引(列名)
使用names选项参数(注意:names指定的是各列的列名,并非行索引;如需指定某列作为行索引应使用index_col)
## 指定行索引
print(pd.read_csv('./Datas/ex2.csv',names=['one','two','three','four','five']))
结果:
one two three four five
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo
常用参数选项
- sep
- header
- index_col
- names
- skiprows
保存数据
## 保存数据为csv
df = pd.read_csv('./Datas/ex2.csv',names=['one','two','three','four','five'])
df.to_csv('./Datas/TestOut.csv',index=False,header=None)
结果:
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
Pandas数据聚合与分析计算
根据一个条件分组
## 数据聚合与分组计算
df = DataFrame({
"key1":list("aabba"),
"key2":["one","two","one","two","one"],
"data1":np.random.randn(5),
"data2":np.random.randn(5),
})
print(df)
grouped = df['data1'].groupby(df['key1'])# 创建GroupBy对象
for g in grouped:
print(g)
groupName,groupData = g ## 解包获取分组名和分组数据
print("------------------------------------")
print(grouped.mean())
结果:
key1 key2 data1 data2
0 a one -1.552278 0.623274
1 a two 0.089848 -0.610365
2 b one -0.129457 -0.033580
3 b two 0.574756 0.683179
4 a one -0.460428 0.290329
('a', 0 -1.552278
1 0.089848
4 -0.460428
Name: data1, dtype: float64)
------------------------------------
('b', 2 -0.129457
3 0.574756
Name: data1, dtype: float64)
------------------------------------
key1
a -0.640953
b 0.222650
Name: data1, dtype: float64
根据两个条件分组
会按两个条件进行分层分组:先按第一个条件分组,再在每组内部按第二个条件分组,结果带有层次化索引
## 根据两个条件分组
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
print(means)
结果:
key1 key2 data1 data2
0 a one -0.310925 0.296294
1 a two -0.775546 0.671051
2 b one 0.199706 0.074525
3 b two -0.534141 -1.387199
4 a one 0.500093 -0.834794
key1 key2
a one 0.094584
two -0.775546
b one 0.199706
two -0.534141
Name: data1, dtype: float64
查看每个分组的数据数量
## 查看每个分组数量
df.groupby(["key1","key2"]).size()
结果:
key1 key2 data1 data2
0 a one 0.939834 -0.264110
1 a two -0.187692 0.069275
2 b one -0.769847 -0.419222
3 b two 0.094817 -0.192999
4 a one -0.043378 -0.682592
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
内置聚合函数
- sum()
- max()
- min()
- mean()
- size()
- count()
- describe()
# 根据字典创建一个DataFrame
dict_obj = {'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
'key2': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'data1': np.random.randint(1, 10, 8),
'data2': np.random.randint(1, 10, 8)}
df_obj5 = DataFrame(dict_obj)
print(df_obj5.groupby("key1").sum())
结果:
data1 data2
key1
a 28 37
b 9 10
自定义聚合函数
tips = pd.read_csv("./Datas/tips.csv")
# 小费占总额百分比
tips['tip_pct'] = tips['tip']/tips['total_bill']
# 按分组选出最高的n个tip_pct值
# 自定义函数
def top_tip(df, n=5, column='tip_pct'):
    """Return the n rows of *df* with the largest values in *column*.

    Rows are sorted ascending by *column*, so the largest value comes last
    (matching the original slicing behavior).

    Parameters
    ----------
    df : DataFrame
        The frame (or groupby sub-frame) to rank.
    n : int, default 5
        Number of top rows to return.
    column : str, default 'tip_pct'
        Column to rank by.
    """
    # tail(n) instead of the original [-n:] slice: with n=0 the slice
    # [-0:] == [0:] returns the ENTIRE frame, whereas tail(0) correctly
    # returns an empty frame.
    return df.sort_values(by=column).tail(n)
# 先分组再调用函数映射
# print(tips.groupby("smoker").apply(top_tip))
# 根据是否吸烟和星期分组并函数映射
print(tips.groupby(['smoker','day']).apply(top_tip,n=1,column='total_bill'))
结果:
total_bill tip smoker day time size tip_pct
smoker day
No Fri 94 22.75 3.25 No Fri Dinner 2 0.142857
Sat 212 48.33 9.00 No Sat Dinner 4 0.186220
Sun 156 48.17 5.00 No Sun Dinner 6 0.103799
Thur 142 41.19 5.00 No Thur Lunch 5 0.121389
Yes Fri 95 40.17 4.73 Yes Fri Dinner 4 0.117750
Sat 170 50.81 10.00 Yes Sat Dinner 3 0.196812
Sun 182 45.35 3.50 Yes Sun Dinner 3 0.077178
Thur 197 43.11 5.00 Yes Thur Lunch 4 0.115982
分位数和桶分析
将一列或多列数据按照条件进行分组统计
# 分位数和桶分析
# 随机生成20个年龄数据
## 方法一:列表推导式
# import random
# l = [random.randint(0,100) for item in range(20)]
# print(l)
## 方法二:使用numpy生成随机整数
age = np.random.randint(0,100,size=(20))
# print(l)
## 对age进行分组
bin = [0,18,25,40,60,100] # 分组区间
age_catagory = pd.cut(age,bins=bin)
print(age_catagory.value_counts())
(0, 18] 1
(18, 25] 3
(25, 40] 4
(40, 60] 6
(60, 100] 5
dtype: int64
拆分成多块后,再结合groupby()实现对数据的桶分析(bucket)和分位数分析(quantile)。使用pd.cut等宽分组时,每个区间的宽度约等于该列最大值减最小值再除以分组数
frame = pd.DataFrame({
"data1":np.random.randn(1000),
"data2":np.random.randn(1000)
})
print(frame.head())
data1 data2
0 0.071223 -1.224070
1 -0.476566 -1.073826
2 -1.113698 -1.338922
3 -0.403567 0.887953
4 -0.582945 0.519279
quartiles = pd.cut(frame["data1"],4)
print(quartiles.value_counts())
(0.0137, 1.451] 436
(-1.424, 0.0137] 430
(-2.867, -1.424] 74
(1.451, 2.889] 60
Name: data1, dtype: int64
将通过cut分桶后的Categorical对象直接传递到groupby,进行统计分析
def get_states(group):
    """Summarize *group* as a dict of its min, max, count and mean."""
    summary = {}
    summary['min'] = group.min()
    summary['max'] = group.max()
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    return summary
grouped = frame['data2'].groupby(quartiles)
grouped.apply(get_states).unstack() # unstack行列转换
min | max | count | mean | |
---|---|---|---|---|
data1 | ||||
(-2.867, -1.424] | -2.037369 | 2.639223 | 74.0 | -0.070759 |
(-1.424, 0.0137] | -2.412002 | 2.685539 | 430.0 | 0.033512 |
(0.0137, 1.451] | -2.622187 | 3.090384 | 436.0 | 0.002669 |
(1.451, 2.889] | -1.968914 | 2.665809 | 60.0 | 0.074737 |
分组加权平均数和相关系数
算术平均数表示
$$
\bar{x} = \frac{1}{n}(x_1+x_2+x_3+…..+x_n)
$$
加权平均数表示
$$
\bar{x} = \frac{x_1\omega_1+x_2\omega_2+x_3\omega_3+…+x_n\omega_n}{\omega_1+\omega_2+…+\omega_n}
$$
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b',
'b'], 'data': np.random.randn(8), 'weights': np.random.rand(8)})
print(df)
category data weights
0 a -0.985415 0.529362
1 a -0.762489 0.872934
2 a 0.947137 0.690249
3 a 0.981109 0.619724
4 b 0.187739 0.098808
5 b -0.648343 0.479388
6 b -1.684569 0.467149
7 b -0.491817 0.563656
按组分为a,b,再将每个data和weights(权重)相加,再除以权重之和,即为加权平均数
可以利用category直接计算分组加权平均数
grouped = df.groupby('category')
get_wavg = lambda g : np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
category
a 0.027480
b -0.843019
dtype: float64
根据两列建立一一对应关系
data[data['mine_code']==140211011523].groupby('point_location')['point_code'].nunique()