pandas 和 numpy 学习记录

学习python也有一段时间了，之前一直在忙，也一直没时间整理自己的学习记录，这几天自己挤出了一点时间，整理了一些自己的学习记录

也希望自己能继续学习下去，也算是督促自己吧！在这个学习的过程，自己发现好像真的喜欢上了python，人生苦短，我用python，下一步，要开始实际的清洗和实现数据的可视化！

这篇文章整理自我在网上找到的一个 numpy 和 pandas 的练习，网址如下：https://www.hackerearth.com/zh/practice/machine-learning/data-manipulation-visualisation-r-python/tutorial-data-manipulation-numpy-pandas-python/tutorial/

有兴趣的可以去看下，由于这个练习后面涉及到了机器学习，所以机器学习这一part我就跳过了！

import numpy as np import pandas as pd # 第一个看一下numpy的版本 np.__version__ 1.16.2 # 创造一个list，从零到九的数 data = list(range(10)) data [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] # # 把整型数据转换成字元串形式有很多种方法 [str(c) for c in data] # 这里用的是 [type(item) for item in data] [int, int, int, int, int, int, int, int, int, int] # 创造一个新的numpy数组 # 1创造一个全是零的数组 np.zeros(10,dtype = int) ? array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]) # 创造一个 3行5列的数组 np.ones((3,5),dtype = float ) array([[1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.]]) # 使用随机的数字创建3行5列的数组 np.random.randn(15).reshape(3,5) array([[ 1.17224013, -1.31714361, -0.57372094, -1.45059089, -0.94262067], [-0.84514455, 1.95279195, -1.4052422 , -0.09462012, 1.23075506], [-1.11058125, -0.82823642, -0.66151707, 0.71700257, -1.12698597]]) # 创建一个15以内的3行5列的数组 np.arange(15).reshape(3,5) array([[ 0, 1, 2, 3, 4], [ 5, 6, 7, 8, 9], [10, 11, 12, 13, 14]]) # 创建一个全是某一个数字的数组 np.full((3,5),3) array([[3, 3, 3, 3, 3], [3, 3, 3, 3, 3], [3, 3, 3, 3, 3]]) # 创建一个有固定步长的数组(如步长是二) np.arange(0,20,2) array([ 0, 2, 4, 6, 8, 10, 12, 14, 16, 18]) # 创建一个均分某个区间的数组 np.linspace(1,2,5,dtype = int) array([1, 1, 1, 1, 2]) # 创建一个均分区间的数组默认的数据类型 np.linspace(2,4,5) array([2. , 2.5, 3. , 3.5, 4. ]) # 创建一个3*3的随机数组 np.random.normal(0,1,(3,3)) array([[ 1.00737997, -1.53233115, -0.74612576], [-1.1850143 , -0.1165628 , -0.67767718], [-1.01919855, 0.08808351, -0.67987634]]) # 创建一个标准矩阵（数组）对应线代中的E，原谅我我了叫什么名字了好像是叫正矩阵 np.eye(3) ? ? array([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]]) # Numpy中很重要的一个元素索引 #值得注意的是，index是从零开始的，据说这是为了避免在计算某个数的内存时要减一的麻烦 x1 = np.array([4,3,4,4,8,4]) x1 array([4, 3, 4, 4, 8, 4]) # 表示出索引为零的值 x1[0] 4 # 表示第5个元素的值 x1[4] 8 #表示出最后一个数据的值 x1[-1] 4 #表示出倒数第二个的值 x1[-2] 8 # 对于多元数组，我们需要行和列的数字去定位 # 这就是索引的作用 x2 = np.arange(9).reshape(3,3) x2 array([[0, 1, 2], [3, 4, 5], [6, 7, 8]]) #找出坐标是第二行第三列的值 x2[1,2] # 需要注意的是，python默认是从零开始的，这个可以参照后面的pandas中的索引 5 # 我想看一看坐标是第一行第二列的数据 x2[0,1] 1 # 同样也可以使用-1这种形式 x2[2,-1] ? 
8 # 再看一个例子 x2[1,-1] 5 # 可以给某一个数组赋值 x2[1,-1] = 10 x2 array([[ 0, 1, 2], [ 3, 4, 10], [ 6, 7, 8]]) x2[0,-1] 2 x2[1,-1] 10 x2[1,-2] 4 x2[-1,-1] 8 x2[-1,1] 7 # 可以给某一个值进行赋值 x2[1,-1] = 19 x2 array([[ 0, 1, 2], [ 3, 4, 19], [ 6, 7, 8]]) # 下一步我们要看一下切片的做法 # 切片个人感觉可以理解为按照一定的刻度进行切分 # 首先我们来生成一组数据 x = np.arange(10) x array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) # 从开始切到第五个数据位置 x[:5] array([0, 1, 2, 3, 4]) #从第五个数据开始往后切 x[5:] array([5, 6, 7, 8, 9]) #从第5个到第7个数据 x[5:8] array([5, 6, 7]) # 每隔一个切分 x[::2] array([0, 2, 4, 6, 8]) # 从第一个数据开始每隔一个数切分 x[1::2] array([1, 3, 5, 7, 9]) # 把数组倒序处理 x[::-1] array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) # 下面的几行代码是数组的合并等操作 #使用concatenate完成合并的操作 x = np.array([1,2,3]) y = np.array([3,2,1]) z = np.array([21,21,21]) np.concatenate([x,y,z]) # 需要注意的是这里类似于集合的并，但是合并以后不能改变原有元素数 array([ 1, 2, 3, 3, 2, 1, 21, 21, 21]) # 对于2维数组，也有著类似的做法 grid = np.array([[1,2,3],[2,3,4]]) np.concatenate([grid,grid]) array([[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 3, 4]]) # 使用axis语句可以控制数组是按照行合并还是列合并 np.concatenate([grid,grid],axis=1)#需要注意的是axis = 1代表的是列，这一点在后面pandas中用的很多 array([[1, 2, 3, 1, 2, 3], [2, 3, 4, 2, 3, 4]]) np.concatenate([grid,grid],axis=0) #默认的是对行操作，这一点可以通过shift+tab键来看一下 np.concatenate <function numpy.concatenate> #上面的两个都是针对相同维数的数组进行操作的 # 如果要合并的两个数组是不同的维数怎么办呢？ # 可以使用 np.vstack 和np.hstack x = np.array([3,4,5]) grid =np.array([[1,2,3],[17,18,19]]) np.vstack([x,grid]) array([[ 3, 4, 5], [ 1, 2, 3], [17, 18, 19]]) # 水平方向的相加 z = np.array([[9],[8]]) np.hstack([grid,z]) array([[ 1, 2, 3, 9], [17, 18, 19, 8]]) #当然这个合并是有先后顺序的 np.hstack([z,grid]) array([[ 9, 1, 2, 3], [ 8, 17, 18, 19]]) # 我们来看一看如果数据结构不一致是是否可以合并 x = np.array([1,1,1,2]) np.vstack([x,grid]) # 可以看到这里报错了，因为要保证要合并的数组要是匹配的 --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-188-68a013f30813> in <module> 1 # 我们来看一看如果数据结构不一致是是否可以合并 2 x = np.array([1,1,1,2]) ----> 3 np.vstack([x,grid]) 4 # 可以看到这里报错了，因为要保证要合并的数组要是匹配的

C:\ProgramData\Anaconda3\envs\py3\lib\site-packages\numpy\core\shape_base.py in vstack(tup)
281 """
282 _warn_for_nonsequence(tup)
--> 283 return _nx.concatenate([atleast_2d(_m) for _m in tup], 0)
284
285

ValueError: all the input array dimensions except for the concatenation axis must match exactly

# 下面让我们看一看split 功能
x = np.arange(10)
x
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
x1,x2,x3 = np.split(x,[3,6])
print(x1,x2,x3)
[0 1 2] [3 4 5] [6 7 8 9]
grid = np.arange(16).reshape((4,4))
grid
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[12, 13, 14, 15]])
upper,lower = np.vsplit(grid,[2])
grid
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11],
[12, 13, 14, 15]])
upper,lower = np.vsplit(grid,[2])
print(upper)
print(lower)
[[0 1 2 3]
[4 5 6 7]]
[[ 8 9 10 11]
[12 13 14 15]]
### not too bad
# 一般pandas的标准名称是pd，所以我就使用pd作为别名
# pandas 主要有DataFrame and Series
# 创建一个DataFrame 这个类似于excel表格,可以使用字典
# Build a DataFrame from a dict: each key becomes a column name and each
# list supplies that column's values.
data = pd.DataFrame({
    'Country': ['Russia', 'Colombia', 'Chile', 'Equador', 'Nigeria'],
    'Rank': [121, 40, 100, 130, 11],
})
data
Country Rank
0 Russia 121
1 Colombia 40
2 Chile 100
3 Equador 130
4 Nigeria 11
数字的平均数
# 使用describe 看一下数据的描述性统计
data.describe()
# 因为这个数据第二列是纯数字，所以描述的内容很全
# count：计算存在的数据
# mean：数字的平均数
# std：标准差
#min：数字最小值
#max：最大值
Rank
count 5.000000
mean 80.400000
std 52.300096
min 11.000000
25% 40.000000
50% 100.000000
75% 121.000000
max 130.000000
# 想要知道更多的数据信息，可以使用info
data.info
<bound method DataFrame.info of Country Rank
0 Russia 121
1 Colombia 40
2 Chile 100
3 Equador 130
4 Nigeria 11>
data.info()
<class pandas.core.frame.DataFrame>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Country 5 non-null object
Rank 5 non-null int64
dtypes: int64(1), object(1)
memory usage: 160.0+ bytes
# 创建一个新的dataframe
# Nine rows in three groups; 'ounces' mixes ints with 7.5, so the column is float.
data = pd.DataFrame({
    'group': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data
group ounces
0 a 4.0
1 a 3.0
2 a 12.0
3 b 6.0
4 b 7.5
5 b 8.0
6 c 3.0
7 c 5.0
8 c 6.0
# 按照ounce列进行排序
data.sort_values(by=[ounces],ascending = True,inplace= False)
# 可以仔细看一下这个数据是怎么排序的一个过程这里是按照所有数据由小到大的排序
group ounces
1 a 3.0
6 c 3.0
0 a 4.0
7 c 5.0
3 b 6.0
8 c 6.0
4 b 7.5
5 b 8.0
2 a 12.0
# 按照ounce列进行排序
data.sort_values(by=[ounces],ascending = True,inplace= True)
data
# 可以仔细看一下这个数据是怎么排序的一个过程这里是按照所有数据由小到大的排序
group ounces
1 a 3.0
6 c 3.0
0 a 4.0
7 c 5.0
3 b 6.0
8 c 6.0
4 b 7.5
5 b 8.0
2 a 12.0
data
group ounces
1 a 3.0
6 c 3.0
0 a 4.0
7 c 5.0
3 b 6.0
8 c 6.0
4 b 7.5
5 b 8.0
2 a 12.0
data1 = data.copy()
data1
group ounces
1 a 3.0
6 c 3.0
0 a 4.0
7 c 5.0
3 b 6.0
8 c 6.0
4 b 7.5
5 b 8.0
2 a 12.0
# 我们按照多列进行排序：下面语句的意思是按照 group 和 ounces 两列排序，group 按升序排列，ounces 按降序排列
data.sort_values(by=[group,ounces],ascending=[True,False],inplace=False)
group ounces
2 a 12.0
0 a 4.0
1 a 3.0
5 b 8.0
4 b 7.5
3 b 6.0
8 c 6.0
7 c 5.0
6 c 3.0
# 去除重复值
# Small frame that contains repeated (k1, k2) rows, used for the de-duplication demo.
data = pd.DataFrame({
    'k1': ['one'] * 3 + ['two'] * 4,
    'k2': [3, 2, 1, 3, 3, 4, 4],
})
data
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
4 two 3
5 two 4
6 two 4
# 对数据排序
data.sort_values(by=k2)
?
k1 k2
2 one 1
1 one 2
0 one 3
3 two 3
4 two 3
5 two 4
6 two 4
# 可以看到这个数据集有重复的，那么我们来去重
#首先来看一下有哪些重复值
data.duplicated()
#因为这个数据量很小，所以可以直接看到，但是当数据量很大的时候，使用 duplicated 就可以直接判断是否存在重复值了
?
0 False
1 False
2 False
3 False
4 True
5 False
6 True
dtype: bool
# 删除重复值
data.drop_duplicates()
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
5 two 4
# 看一下data
data
#可以看到这里 data 是没有改变的：drop_duplicates 默认会返回一个去重后的新 DataFrame，而不会直接修改原始数据，
#也就是默认 inplace = False；如果想保留去重结果，可以传入 inplace = True，或者把返回值重新赋值给 data
?
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
4 two 3
5 two 4
6 two 4
data.drop_duplicates()
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
5 two 4
# 下面删除指定列的重复值
data.drop_duplicates(subset=k1)
?
k1 k2
0 one 3
3 two 3
# Food names deliberately vary in case ('bacon' / 'Bacon', 'Pastrami' / 'pastrami')
# so the following cells can show lower-casing before a dictionary lookup.
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
# 下面我要新建一列数据，添加到原有数据中
#首先使用一个dictionary（字典）
# Lookup table from (lower-case) food name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
#下面要创建一个函数
def meat_2_anmial(series):
    """Return the animal that the ``'food'`` entry of one row comes from.

    Args:
        series: A row-like object indexable with the key ``'food'``.

    Returns:
        ``'pig'``, ``'cow'`` or ``'salmon'``. The comparison is exact and
        case-sensitive, and any food not listed below falls through to
        ``'salmon'``.

    NOTE(review): the name is spelled ``meat_2_anmial``, while the later
    ``data.apply(meat_2_animal, ...)`` call uses a different spelling; that
    mismatch is the cause of the NameError recorded further down.
    """
    food = series['food']
    if food in ('bacon', 'pulled pork', 'honey ham'):
        return 'pig'
    if food in ('pastrami', 'corned beef'):
        return 'cow'
    # Everything else (including 'nova lox') is treated as salmon.
    return 'salmon'
# 使用map 遍历上述字典
data[animal] = data[food].map(str.lower).map(meat_to_animal)
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
# 第二种方式是使用lambda函数，即匿名函数
lower = lambda x: x.lower()
data[food] = data[food].apply(lower)
data[animal2] = data.apply(meat_2_animal, axis=columns)
data
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-213-69a41af39b0d> in <module>
2 lower = lambda x: x.lower()
3 data[food] = data[food].apply(lower)
----> 4 data[animal2] = data.apply(meat_2_animal, axis=columns)
5 data

NameError: name meat_2_animal is not defined

# 还有一种生成一个新列的方式是
data.assign(new_variable = data[ounces]*10)
food ounces animal new_variable
0 bacon 4.0 pig 40.0
1 pulled pork 3.0 pig 30.0
2 bacon 12.0 pig 120.0
3 pastrami 6.0 cow 60.0
4 corned beef 7.5 cow 75.0
5 bacon 8.0 pig 80.0
6 pastrami 3.0 cow 30.0
7 honey ham 5.0 pig 50.0
8 nova lox 6.0 salmon 60.0
# 在实际应用中，有时会因为各种原因会产生各种数据的缺失
#这里使用 pd.isna 来判断是否存在na
data = pd.Series([1,-999,2,-999,-1000,3])
data
0 1
1 -999
2 2
3 -999
4 -1000
5 3
dtype: int64
# 使用replace将-999换成NaN
data.replace(-999,np.nan,inplace = True)
data
?
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
# 同样可以一次替换多个值
data = pd.Series([1,-999,2,-999,-10000,3])
data.replace([-999,-10000],np.nan,inplace = True)
data
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
# 现在我们来看一看如何判断是否存在na
data.isna()
# 可以看到这里存在na的
0 False
1 True
2 False
3 True
4 True
5 False
dtype: bool
# 有时候数据的列名不是很好理解，这个时候就可以使用rename的形式，来重命名
# 3x4 frame of 0..11 with string row labels and column names, used for the rename demo.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
# 重命名
data.rename(index = {Ohio:SanF},columns = {one:one_p,two:two_p},inplace = True)
data
one_p two_p three four
SanF 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
# 同样使用string 函数来改变行名和列名的大小写字母
# str.upper 是把所有字母都写成大写
# str.title 是把首字母大写
?
data.rename(index = str.upper,columns = str.title,inplace= True)
data
One_P Two_P Three Four
SANF 0 1 2 3
COLORADO 4 5 6 7
NEW YORK 8 9 10 11
# 使用pd.cut 这里可以理解为分段函数
# demo
ages = [20,22,25,27,21,23,37,31,61,45,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
?
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# 设置 right = False 后，区间变为左闭右开：包含左端点的值，不包含右端点的值
pd.cut(ages,bins,right = False)
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]
cats.labels
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-224-af28c901402c> in <module>
----> 1 cats.labels

AttributeError: Categorical object has no attribute labels

# 看一下各个阶段都有几个数据
pd.value_counts(cats)
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# 我们也可以使用有实际意义的名字去代替
# One label per interval produced by the four bin edges above, in ascending order.
bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages,bins,labels = bin_names)
pd.value_counts(new_cats)
Youth 5
MiddleAge 3
YoungAdult 3
Senior 1
dtype: int64
# 也可以使用累加
pd.value_counts(new_cats).cumsum()
Youth 5
MiddleAge 8
YoungAdult 11
Senior 12
dtype: int64
# 下面看一下GROUP group很类似于sql中的group 也就是所谓的分组
# Two label columns to group by, plus two columns of random normal data to aggregate.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df
key1 key2 data1 data2
0 a one -0.687908 -0.456465
1 a two 0.040095 -1.286556
2 b one -1.771255 -2.742873
3 b two 0.620357 -0.959591
4 a one -1.074818 1.024162
# 第一个是按照key1 去分组计算data1的平均值
grouped = df[data1].groupby(df[key1])
grouped.mean()
# 其实这里只是求得了一类，还有很多函数，比如求最值，等等
key1
a -0.574211
b -0.575449
Name: data1, dtype: float64
# 下面看一下dataframe的切分
# The start date must be a string: a bare integer would not be parsed as 2013-01-01.
dates = pd.date_range('20130101', periods=6)
# 6x4 frame of random normal values, indexed by the six consecutive days, columns A-D.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df
A B C D
2013-01-01 -0.605596 -2.268895 -0.134708 0.250404
2013-01-02 0.715821 0.022899 -0.085071 0.284433
2013-01-03 -0.130514 -1.364424 0.506235 0.528120
2013-01-04 0.126970 0.259528 -0.696607 -0.112268
2013-01-05 -0.107711 -1.349693 -0.320895 1.399890
2013-01-06 -0.425530 1.208063 0.277692 0.412692
# 得到dataframe的前几行
df[:3]
A B C D
2013-01-01 -0.605596 -2.268895 -0.134708 0.250404
2013-01-02 0.715821 0.022899 -0.085071 0.284433
2013-01-03 -0.130514 -1.364424 0.506235 0.528120
# 根据日期范围来切分
df[20130101:20130104]
A B C D
2013-01-01 -0.605596 -2.268895 -0.134708 0.250404
2013-01-02 0.715821 0.022899 -0.085071 0.284433
2013-01-03 -0.130514 -1.364424 0.506235 0.528120
2013-01-04 0.126970 0.259528 -0.696607 -0.112268
df[[20130101],[20130104]]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-238-379945f05320> in <module>
----> 1 df[[20130101],[20130104]]

C:ProgramDataAnaconda3envspy3libsite-packagespandascoreframe.py in __getitem__(self, key)
2925 if self.columns.nlevels > 1:
2926 return self._getitem_multilevel(key)
-> 2927 indexer = self.columns.get_loc(key)
2928 if is_integer(indexer):
2929 indexer = [indexer]

C:ProgramDataAnaconda3envspy3libsite-packagespandascoreindexesase.py in get_loc(self, key, method, tolerance)
2655 backfill or nearest lookups)
2656 try:
-> 2657 return self._engine.get_loc(key)
2658 except KeyError:
2659 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: ([20130101], [20130104]) is an invalid key

ab两列
#按照列名进行切分
df.loc[:,[A,B]] #这句代码是得到ab两列
A B
2013-01-01 -0.605596 -2.268895
2013-01-02 0.715821 0.022899
2013-01-03 -0.130514 -1.364424
2013-01-04 0.126970 0.259528
2013-01-05 -0.107711 -1.349693
2013-01-06 -0.425530 1.208063
#使用具体的行和列的坐标去定位分析
df.loc[20130102:20130103,[A,B]]
A B
2013-01-02 0.715821 0.022899
2013-01-03 -0.130514 -1.364424
# 按照整数位置（而不是行/列的标签名）来切分
df.iloc[3] # 返回的是第4行的数据
?
A 0.126970
B 0.259528
C -0.696607
D -0.112268
Name: 2013-01-04 00:00:00, dtype: float64
# 使用特定的行和列去定位
df.iloc[[1,5],[0,2]]
A C
2013-01-02 0.715821 -0.085071
2013-01-06 -0.425530 0.277692

df[df.A > 1]
A B C D
df2
# 使用硬copy
df2 = df.copy()
df2[E] = [one,one,two,three,four,three]
df2
A B C D E
2013-01-01 -0.605596 -2.268895 -0.134708 0.250404 one
2013-01-02 0.715821 0.022899 -0.085071 0.284433 one
2013-01-03 -0.130514 -1.364424 0.506235 0.528120 two
2013-01-04 0.126970 0.259528 -0.696607 -0.112268 three
2013-01-05 -0.107711 -1.349693 -0.320895 1.399890 four
2013-01-06 -0.425530 1.208063 0.277692 0.412692 three
列
# 根据列数值选择行
df2[df2[E].isin([two,four])]
A B C D E
2013-01-03 -0.130514 -1.364424 0.506235 0.52812 two
2013-01-05 -0.107711 -1.349693 -0.320895 1.39989 four
# 选择出不包含two和four的行
df2[~df2[E].isin([two,four])]
A B C D E
2013-01-01 -0.605596 -2.268895 -0.134708 0.250404 one
2013-01-02 0.715821 0.022899 -0.085071 0.284433 one
2013-01-04 0.126970 0.259528 -0.696607 -0.112268 three
2013-01-06 -0.425530 1.208063 0.277692 0.412692 three
d
#下面使用query的形式来得到想要的数据
df.query(A > C)
A B C D
2013-01-02 0.715821 0.022899 -0.085071 0.284433
2013-01-04 0.126970 0.259528 -0.696607 -0.112268
2013-01-05 -0.107711 -1.349693 -0.320895 1.399890
# 使用逻辑或
df.query(A < B | C > A)
A B C D
2013-01-01 -0.605596 -2.268895 -0.134708 0.250404
2013-01-03 -0.130514 -1.364424 0.506235 0.528120
2013-01-04 0.126970 0.259528 -0.696607 -0.112268
2013-01-06 -0.425530 1.208063 0.277692 0.412692
pivot_table的几个例子
# excel这么流行的原因在于数据透视表
#下面我们将会看一看pivot_table的几个例子
# Three groups of three measurements each, used as input for the pivot_table examples.
data = pd.DataFrame({
    'group': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data
group ounces
0 a 4.0
1 a 3.0
2 a 12.0
3 b 6.0
4 b 7.5
5 b 8.0
6 c 3.0
7 c 5.0
8 c 6.0
# 计算每一组的平均值
data.pivot_table(values =ounces,index=group,aggfunc=np.mean)
?
ounces
group
a 6.333333
b 7.166667
c 4.666667
#统计每一组的数据数
data.pivot_table(values=ounces,index=group,aggfunc=count)
ounces
group
a 3
b 3
c 3
r
#截至目前，已经了解了很多的基本pandas numpy的操作
#下面将看一看具体的数据处理的过程
#数据来源于https://s3-ap-southeast-1.amazonaws.com/he-public-data/datafiles19cdaf8.zip
# Raw strings keep the Windows backslashes literal; without the r-prefix the
# "\t" in "\train.csv" / "\test.csv" would be read as a tab character.
# NOTE(review): the directory separators were lost in the original text and
# are reconstructed here - confirm the folder layout on the local machine.
train = pd.read_csv(r'E:\python\datafiles19cdaf8\train.csv')
test = pd.read_csv(r'E:\python\datafiles19cdaf8\test.csv')
#看一下数据的基础信息
train.info()
<class pandas.core.frame.DataFrame>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age 32561 non-null int64
workclass 30725 non-null object
fnlwgt 32561 non-null int64
education 32561 non-null object
education.num 32561 non-null int64
marital.status 32561 non-null object
occupation 30718 non-null object
relationship 32561 non-null object
race 32561 non-null object
sex 32561 non-null object
capital.gain 32561 non-null int64
capital.loss 32561 non-null int64
hours.per.week 32561 non-null int64
native.country 31978 non-null object
target 32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
#可以看到train set 共有32561行15列
#我们看一下train set的前五行
train.head()
age workclass fnlwgt education education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country target
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
# 看一下有多少缺失值
# Rows containing at least one missing value = total rows minus the rows
# that survive dropna().
nans = train.shape[0] - train.dropna().shape[0]
print('%d rows missing values in the train data' % nans)
nand = test.shape[0] - test.dropna().shape[0]
print('%d rows have missing values in the test data' % nand)
2399 rows missing values in the train data
1221 rows have missing values in the test data
# 看一下哪些列有缺失数据
train.isnull().sum()
age 0
workclass 1836
fnlwgt 0
education 0
education.num 0
marital.status 0
occupation 1843
relationship 0
race 0
sex 0
capital.gain 0
capital.loss 0
hours.per.week 0
native.country 583
target 0
dtype: int64
# Count the distinct values in every object-dtype column.
# The dtype selector is the string 'O' (object), not the digit 0.
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)
workclass 8
education 16
marital.status 7
occupation 14
relationship 6
race 5
sex 2
native.country 41
target 2
dtype: int64
# 因为每一列对于缺失值的处理都不相同
# Each column with missing values is filled with its own fixed category,
# chosen after inspecting that column's value_counts().
# The filled column is assigned back to `train`: calling fillna(inplace=True)
# on a selected column is not guaranteed to update the parent frame.

# Workclass
train.workclass.value_counts(sort=True)
train['workclass'] = train['workclass'].fillna('Private')

# Occupation
train.occupation.value_counts(sort=True)
train['occupation'] = train['occupation'].fillna('Prof-specialty')

# Native country
train['native.country'].value_counts(sort=True)
train['native.country'] = train['native.country'].fillna('United-States')
# 看一下处理以后还有没有缺失的数据
train.isnull().sum()
age 0
workclass 0
fnlwgt 0
education 0
education.num 0
marital.status 0
occupation 0
relationship 0
race 0
sex 0
capital.gain 0
capital.loss 0
hours.per.week 0
native.country 0
target 0
dtype: int64
train.target.value_counts()/train.shape[0]
train.target.value_counts()/train.shape[0]
<=50K 0.75919
>50K 0.24081
Name: target, dtype: float64
0
train.shape[0]
32561
1
train.shape[1]
15
?
Install selected packages