文章目录[隐藏]
pandas
数据处理库
读取文件
pandas.read_csv('xxxx.csv')
import pandas as pd
m = pd.read_csv('m0.csv')
print(type(m))
print(m.dtypes)
# 字符值为object
# print(help(pd.read_csv))
# <class 'pandas.core.frame.DataFrame'>
# noteid int64
# notebook object
# username object
# date object
# dtype: object
显示数据
m.head() #把数据显示出来,默认显示前五条
# m.head(3) #显示前3条
# m.tail(4) # 显示后四行
# print(m.columns) # 显示列的指标
# print(m.shape) # 查看数据维度(m,n)=》表示总共有m个样本,每个样本有n个指标
取数据
# location
### 取第一个数据
print(m.loc[0])
print('----------')
print(m.loc[3])
### 取从3开始到6结束
m.loc[3:6]
### 取1,3,5
id = {1,3,5}
m.loc[id]
### 一列一列取
col = m['noteid'] # 列名来定义,如无则默认为第一行为列名
print(col)
### 取多列
col = ['noteid','username']
data = m[col]
print(data)
# 取以什么来结尾的列
# 比如找以(kg)结尾的
import pandas as pd
m = pd.read_csv('m0.csv')
col_names = m.columns.tolist() #把当前列名变成list
print(col_names)
gram_columns = []
for c in col_names:
if c.endswith("(kg)"): # 以(kg)结尾
gram_columns.append(c) #追加
gram_def = m[gram_columns]
print(gram_def.head(3))
# noteid 0
# school A
# username adads
# height(cm) 160
# weight(kg) 40
# date 2019/8/27
# Name: 0, dtype: object
# ----------
# noteid 3
# school D
# username zcvxvzxcv
# height(cm) 163
# weight(kg) 43
# date 2019/8/30
# Name: 3, dtype: object
运算
# 四则运算
div_m = m['height(cm)'] / 100
print(div_m)
# 0 1.60
# 1 1.61
# 2 1.62
# 3 1.63
# 4 1.64
# 5 1.65
# 6 1.66
# 7 1.67
# 8 1.68
# 9 1.69
# 10 1.70
# Name: height(cm), dtype: float64
新加一列
# 相同维度的运算
import pandas as pd
m = pd.read_csv('m0.csv')
data = m['height(cm)'] * m['weight(kg)']
print(data)
print(data.shape)
print('======新加一列======')
height_m = m['height(cm)'] / 100
m['height(m)'] = height_m # 是添加了,但并未写入文件
print(m['height(m)'])
print(m)
# 0 6400
# 1 6601
# 2 6804
# 3 7009
# 4 7216
# 5 7425
# 6 7636
# 7 7849
# 8 8064
# 9 8281
# 10 8500
# dtype: int64
# (11,)
# ======新加一列======
# 0 1.60
# 1 1.61
# 2 1.62
# 3 1.63
# 4 1.64
# 5 1.65
# 6 1.66
# 7 1.67
# 8 1.68
# 9 1.69
# 10 1.70
# Name: height(m), dtype: float64
# noteid school username height(cm) weight(kg) date height(m)
# 0 0 A adads 160 40 2019/8/27 1.60
# 1 1 B sdfsadf 161 41 2019/8/28 1.61
# 2 2 C sdfasdf 162 42 2019/8/29 1.62
# 3 3 D zcvxvzxcv 163 43 2019/8/30 1.63
# 4 4 E sdfasf 164 44 2019/8/31 1.64
# 5 5 支持v在v在 必胜德国法国 165 45 2019/9/1 1.65
# 6 6 在v秩序册 在v出租车v 166 46 2019/9/2 1.66
# 7 7 支持v在v从 在v从中选出v 167 47 2019/9/3 1.67
# 8 8 v自行车v自行车v 在v出租车v 168 48 2019/9/4 1.68
# 9 9 在v现在v 自行车v在v从 169 49 2019/9/5 1.69
# 10 10 豆腐干豆腐干大锅饭 在v自行车v 170 50 2019/9/6 1.70
排序
import pandas as pd
m = pd.read_csv('m0.csv')
print('-------升序------')
m.sort_values("weight(kg)",inplace=True) # 默认ascending为Ture,用来升序
print(m["weight(kg)"])
print('-------降序------')
m.sort_values("weight(kg)",inplace=True,ascending=False) # 默认ascending为Ture,用来升序
print(m['weight(kg)'])
# -------升序------
# 0 40
# 1 41
# 3 43
# 4 44
# 6 46
# 7 47
# 8 48
# 9 49
# 10 50
# 5 66
# 2 888
# Name: weight(kg), dtype: int64
# -------降序------
# 2 888
# 5 66
# 10 50
# 9 49
# 8 48
# 7 47
# 6 46
# 4 44
# 3 43
# 1 41
# 0 40
# Name: weight(kg), dtype: int64
NAN的含义
pandas认为NAN为缺失值,或打印不出来的值。一般把缺失值放到最后
math、numpy、pandas NaN 判断
>> np.nan == np.nan
False
>> np.nan is np.nan
True
>> math.nan is np.nan
False
>> np.isnan(math.nan)
True
1. 判断 ndarray 中是否存在 nan
>> c = np.array([ 1., 2., np.nan, 3., 4.])
>> np.isnan(c)
array([False, False, True, False, False])
-
注意
>> np.nan != np.nan True >> np.nan in c False
-
上述方案要么返回的是一个序列,要么给出的是错误的结果。判断 numpy 下的多维数组中是否存在 nan 的简单方式:
>> np.isnan(np.min(c)) True >> np.isnan(np.sum(c)) True
-
因为
>> np.min(c) nan >> np.sum(c) nan
2. 将 nan 填充为均值
>> c = np.array([ 1., 2., np.nan, 3., 4.])
>> c[np.isnan(c)] = np.mean(c[~np.nan(c)])
版权声明:本文为CSDN博主「Inside_Zhang」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。原文链接:https://blog.csdn.net/lanchunhui/article/details/80399681
NAN例子
import pandas as pd
import numpy as np
data = pd.read_csv('titanic/train.csv') # https://www.kaggle.com/c/titanic/data
data.head()
#
# 数据内容
#
# pclass 仓位等级
# SibSp 兄弟姐妹数量
# Parch 父母/子女
# Fare 船票价格
# Cabin 床仓编号/NaN无该值
# Embarked 登船地点/码头
print('-------------')
age = data["Age"]
print(age.loc[0:10]) # 读取前十个值
print('@@@@@@@@@@@@@@@@@')
age_is_null = pd.isnull(age) # 判断缺失值,false则不是,ture则是缺失值
print(age_is_null)
print('@@@@@@@@@@@@@@@@')
age_null_true = age[age_is_null] # 筛选,这里传入的为(true/false),把true的留下来
print(age_null_true)
print('@@@@@@@@@@@@@@@@')
age_null_count = len(age_null_true) # 当前长度
print(age_null_count)
# 未处理缺失值的情况
print('************未处理的情况**************')
mean_age = sum(data['Age']) / len(data['Age']) # 有缺失值则结果为NaN,
print(mean_age)
# 处理后的情况
print('&&&&&&&&&&&&&&处理后的情况&&&&&&&&&&')
good_ages = data['Age'][age_is_null == False]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
print('&&&&&&&&&&&&&&Pandas函数mean()来实现&&&&&&&&&&')
# pandas默认的函数来实现以上功能
correct_mean_age = data['Age'].mean()
print(correct_mean_age)
# 每个仓位等级的平均价格
print('每个仓位等级的平均价格')
passanger_classes = [1,2,3]
fares_by_class = {}
for this_class in passanger_classes:
pclass_rows = data[data["Pclass"] == this_class ]
pclass_fares = pclass_rows["Fare"]
fare_for_class = pclass_fares.mean()
fares_by_class[this_class] = fare_for_class
print(fares_by_class)
# pandas快速来实现
## 函数pivot_table
print('依靠函数pivot_table来实现上述功能')
passanger_survival = data.pivot_table(index="Pclass",values="Fare",aggfunc=np.mean) # index:以谁为基准,values:index和什么的关系,aggfunc:指什么关系
print(passanger_survival)
print('================================')
#
# 默认求平均值
#
passanger_age = data.pivot_table(index="Pclass",values="Age") # 求平均年龄,少写一个aggfun。按照默认求均值来操作
print(passanger_age)
print('++++++++++++++++++++++++++++++++')
port_stats = data.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum) #一个量和其它两个量之间的关系
print(port_stats)
## 函数 dropna
print('==============dropna/把缺失值丢掉==================')
#specifying axis = 1 or axis="columns" will drop any columns that have null values
drop_na_columns = data.dropna(axis=1)
print(drop_na_columns)
print('++==++')
new_passanger_survival = data.dropna(axis = 0,subset=['Age','Sex']) # 如果这俩列有缺失值,则把有缺失值当前对应的行给去掉
print(new_passanger_survival)
print('^^^^^^^^^^^查找/定位到一个具体值^^^^^^^^^^^^^')
row_index_83_age = data.loc[83,"Age"] # 83表示行,"Age"表示这一行"Age"这一列;一下同理
row_index_1000_pclass = data.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
# -------------
# 0 22.0
# 1 38.0
# 2 26.0
# 3 35.0
# 4 35.0
# 5 NaN
# 6 54.0
# 7 2.0
# 8 27.0
# 9 14.0
# 10 4.0
# Name: Age, dtype: float64
# @@@@@@@@@@@@@@@@@
# 0 False
# 1 False
# 2 False
# 3 False
# 4 False
# ...
# 886 False
# 887 False
# 888 True
# 889 False
# 890 False
# Name: Age, Length: 891, dtype: bool
# @@@@@@@@@@@@@@@@
# 5 NaN
# 17 NaN
# 19 NaN
# 26 NaN
# 28 NaN
# ..
# 859 NaN
# 863 NaN
# 868 NaN
# 878 NaN
# 888 NaN
# Name: Age, Length: 177, dtype: float64
# @@@@@@@@@@@@@@@@
# 177
# ************未处理的情况**************
# nan
# &&&&&&&&&&&&&&处理后的情况&&&&&&&&&&
# 29.69911764705882
# &&&&&&&&&&&&&&Pandas函数mean()来实现&&&&&&&&&&
# 29.69911764705882
# 每个仓位等级的平均价格
# {1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
# 依靠函数pivot_table来实现上述功能
# Fare
# Pclass
# 1 84.154687
# 2 20.662183
# 3 13.675550
# ================================
# Age
# Pclass
# 1 38.233441
# 2 29.877630
# 3 25.140620
# ++++++++++++++++++++++++++++++++
# Fare Survived
# Embarked
# C 10072.2962 93
# Q 1022.2543 30
# S 17439.3988 217
# ==============dropna/把缺失值丢掉==================
# PassengerId Survived Pclass \
# 0 1 0 3
# 1 2 1 1
# 2 3 1 3
# 3 4 1 1
# 4 5 0 3
# .. ... ... ...
# 886 887 0 2
# 887 888 1 1
# 888 889 0 3
# 889 890 1 1
# 890 891 0 3
# Name Sex SibSp Parch \
# 0 Braund, Mr. Owen Harris male 1 0
# 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 1 0
# 2 Heikkinen, Miss. Laina female 0 0
# 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 1 0
# 4 Allen, Mr. William Henry male 0 0
# .. ... ... ... ...
# 886 Montvila, Rev. Juozas male 0 0
# 887 Graham, Miss. Margaret Edith female 0 0
# 888 Johnston, Miss. Catherine Helen "Carrie" female 1 2
# 889 Behr, Mr. Karl Howell male 0 0
# 890 Dooley, Mr. Patrick male 0 0
# Ticket Fare
# 0 A/5 21171 7.2500
# 1 PC 17599 71.2833
# 2 STON/O2. 3101282 7.9250
# 3 113803 53.1000
# 4 373450 8.0500
# .. ... ...
# 886 211536 13.0000
# 887 112053 30.0000
# 888 W./C. 6607 23.4500
# 889 111369 30.0000
# 890 370376 7.7500
# [891 rows x 9 columns]
# ++==++
# PassengerId Survived Pclass \
# 0 1 0 3
# 1 2 1 1
# 2 3 1 3
# 3 4 1 1
# 4 5 0 3
# .. ... ... ...
# 885 886 0 3
# 886 887 0 2
# 887 888 1 1
# 889 890 1 1
# 890 891 0 3
# Name Sex Age SibSp \
# 0 Braund, Mr. Owen Harris male 22.0 1
# 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
# 2 Heikkinen, Miss. Laina female 26.0 0
# 3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
# 4 Allen, Mr. William Henry male 35.0 0
# .. ... ... ... ...
# 885 Rice, Mrs. William (Margaret Norton) female 39.0 0
# 886 Montvila, Rev. Juozas male 27.0 0
# 887 Graham, Miss. Margaret Edith female 19.0 0
# 889 Behr, Mr. Karl Howell male 26.0 0
# 890 Dooley, Mr. Patrick male 32.0 0
# Parch Ticket Fare Cabin Embarked
# 0 0 A/5 21171 7.2500 NaN S
# 1 0 PC 17599 71.2833 C85 C
# 2 0 STON/O2. 3101282 7.9250 NaN S
# 3 0 113803 53.1000 C123 S
# 4 0 373450 8.0500 NaN S
# .. ... ... ... ... ...
# 885 5 382652 29.1250 NaN Q
# 886 0 211536 13.0000 NaN S
# 887 0 112053 30.0000 B42 S
# 889 0 111369 30.0000 C148 C
# 890 0 370376 7.7500 NaN Q
# [714 rows x 12 columns]
# ^^^^^^^^^^^查找^^^^^^^^^^^^^
# 28.0
# 1
排序
## 排序
import pandas as pd
passanger_data = pd.read_csv('titanic/train.csv')
new_passanger_survival = passanger_data.sort_values("Age",ascending=False) # 按年龄来降序排序
print(new_passanger_survival)
passanger_reindex = new_passanger_survival.reset_index(drop=True) # 把index值(索引值)按降序规则来来重新排序
print('--------------------------')
print(passanger_reindex.loc[0:10]) # \ 表示换行显示
# PassengerId Survived Pclass Name \
# 630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
# 851 852 0 3 Svensson, Mr. Johan
# 493 494 0 1 Artagaveytia, Mr. Ramon
# 96 97 0 1 Goldschmidt, Mr. George B
# 116 117 0 3 Connors, Mr. Patrick
# .. ... ... ... ...
# 859 860 0 3 Razi, Mr. Raihed
# 863 864 0 3 Sage, Miss. Dorothy Edith "Dolly"
# 868 869 0 3 van Melkebeke, Mr. Philemon
# 878 879 0 3 Laleff, Mr. Kristo
# 888 889 0 3 Johnston, Miss. Catherine Helen "Carrie"
# Sex Age SibSp Parch Ticket Fare Cabin Embarked
# 630 male 80.0 0 0 27042 30.0000 A23 S
# 851 male 74.0 0 0 347060 7.7750 NaN S
# 493 male 71.0 0 0 PC 17609 49.5042 NaN C
# 96 male 71.0 0 0 PC 17754 34.6542 A5 C
# 116 male 70.5 0 0 370369 7.7500 NaN Q
# .. ... ... ... ... ... ... ... ...
# 859 male NaN 0 0 2629 7.2292 NaN C
# 863 female NaN 8 2 CA. 2343 69.5500 NaN S
# 868 male NaN 0 0 345777 9.5000 NaN S
# 878 male NaN 0 0 349217 7.8958 NaN S
# 888 female NaN 1 2 W./C. 6607 23.4500 NaN S
# [891 rows x 12 columns]
# --------------------------
# PassengerId Survived Pclass Name Sex \
# 0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
# 1 852 0 3 Svensson, Mr. Johan male
# 2 494 0 1 Artagaveytia, Mr. Ramon male
# 3 97 0 1 Goldschmidt, Mr. George B male
# 4 117 0 3 Connors, Mr. Patrick male
# 5 673 0 2 Mitchell, Mr. Henry Michael male
# 6 746 0 1 Crosby, Capt. Edward Gifford male
# 7 34 0 2 Wheadon, Mr. Edward H male
# 8 55 0 1 Ostby, Mr. Engelhart Cornelius male
# 9 281 0 3 Duane, Mr. Frank male
# 10 457 0 1 Millet, Mr. Francis Davis male
# Age SibSp Parch Ticket Fare Cabin Embarked
# 0 80.0 0 0 27042 30.0000 A23 S
# 1 74.0 0 0 347060 7.7750 NaN S
# 2 71.0 0 0 PC 17609 49.5042 NaN C
# 3 71.0 0 0 PC 17754 34.6542 A5 C
# 4 70.5 0 0 370369 7.7500 NaN Q
# 5 70.0 0 0 C.A. 24580 10.5000 NaN S
# 6 70.0 1 1 WE/P 5735 71.0000 B22 S
# 7 66.0 0 0 C.A. 24579 10.5000 NaN S
# 8 65.0 0 1 113509 61.9792 B30 C
# 9 65.0 0 0 336439 7.7500 NaN Q
# 10 65.0 0 0 13509 26.5500 E38 S
自定义函数
import pandas as pd
passanger_data = pd.read_csv('titanic/train.csv')
# this function returns the bundbreath(第一百行) item from a series
def hundredth_row(column):
# Extract the hundredth item
hundredth_items = column.loc[99] # start form 0
return hundredth_items
# Return the hundredth item from each column
hundredth_row = passanger_data.apply(hundredth_row) # apply 用来申请执行
print(hundredth_row)
print()
# 返回每一个缺失值的个数
def not_null_count(column):
column_null = pd.isnull(column)
null = column[column_null]
return len(null)
column_null_count = passanger_data.apply(not_null_count)
print(column_null_count)
#
def which_class(row):
pclass = row['Pclass']
if pd.isnull(pclass):
return 'Unknown'
elif pclass == 1:
return 'First Class'
elif pclass == 2:
return 'Second Class'
elif pclass == 3:
return 'Third Class'
classes = passanger_data.apply(which_class,axis = 1) # axis ?? means what??
print(classes)
print()
def is_minor(row):
if row['Age'] < 18:
return True
else:
return False
minors = passanger_data.apply(is_minor,axis = 1)
# print minors
def generate_age_label(row):
age = row['Age']
if pd.isnull(age):
return 'Unknown'
elif age < 18:
return 'minor'
else:
return 'adult'
age_labels = passanger_data.apply(generate_age_label,axis = 1)
print(age_labels)
print('--------------')
passanger_data['age_labels'] = age_labels
age_group_survival = passanger_data.pivot_table(index="age_labels",values="Survived") #默认求各年龄获救的平局值
print(age_group_survival)
# PassengerId 100
# Survived 0
# Pclass 2
# Name Kantor, Mr. Sinai
# Sex male
# Age 34
# SibSp 1
# Parch 0
# Ticket 244367
# Fare 26
# Cabin NaN
# Embarked S
# dtype: object
# PassengerId 0
# Survived 0
# Pclass 0
# Name 0
# Sex 0
# Age 177
# SibSp 0
# Parch 0
# Ticket 0
# Fare 0
# Cabin 687
# Embarked 2
# dtype: int64
# 0 Third Class
# 1 First Class
# 2 Third Class
# 3 First Class
# 4 Third Class
# ...
# 886 Second Class
# 887 First Class
# 888 Third Class
# 889 First Class
# 890 Third Class
# Length: 891, dtype: object
# 0 adult
# 1 adult
# 2 adult
# 3 adult
# 4 adult
# ...
# 886 adult
# 887 adult
# 888 Unknown
# 889 adult
# 890 adult
# Length: 891, dtype: object
# --------------
# Survived
# age_labels
# Unknown 0.293785
# adult 0.381032
# minor 0.539823
Series
## Import the Series object from pandas
## ???
from pandas import Series
passanger_data = pd.read_csv('titanic/train.csv')
series_files = passanger_data['Name'] # 其中的一列
Passanger_name = series_files.values # series该列里面的值
# print(type(Passanger_name))
# print(Passanger_name)
print('---------------')
series_rt = passanger_data['Ticket']
rt_ticket = series_rt.values
# print(rt_scores)
series_custom = Series(rt_ticket,index = Passanger_name) # 用名字当索引
# series_custom[['Odahl, Mr. Nils Martin','Jonkoff, Mr. Lalio']] # 打印 这和下面大打印只能存在一个,如都存在,这一个不会显示
print('################')
fiveten = series_custom[10:20]
print(fiveten)
## 排序
print('\n\n--------排序-------------\n')
series_files = passanger_data['PassengerId'] # 其中的一列
Passanger_id = series_files.values # series该列里面的值
series_age = passanger_data['Age']
Passanger_age = series_age.values
series_custom1 = Series(Passanger_age,index = Passanger_id) # 把id当索引
original_index = series_custom1.index.tolist()
#print original_index
sorted_index = sorted(original_index)
sorted_by_index = series_custom1.reindex(sorted_index)
print(sorted_by_index)
print('----按index(键)排序---------')
sc2 = series_custom1.sort_index() # 按键排序
print(sc2)
print('\n----按values(值)排序---------') # 按值排序
sc3 = series_custom1.sort_values()
print(sc3)
print('===========================')
## 数学运算
# The value in a Series object are treated as an ndayyat,the core data type in Numpy
import numpy as np
# Add each value with each other
print(np.add(series_custom1,series_custom1)) # 值一样这之间相加,值不一样则对应相加????
# Apply sin function to each other
#
# np.sine(series_custom)
#
# module 'numpy' has no attribute 'sine'
# Return the hightest value (vill return a single value not a Series)
np.max(series_custom1)
# will actually return a Series object with a boolean value for each ticket
# > 50 返回一些true/false值
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50] # 拿true/false来返回值
series_one = series_custom > 50
series_two = series_custom < 75
both_criteria = series_custom[series_one & series_two]
print(both_criteria)
print('-=-=-=-=-=-=-=-=不同票价求平均值=-=-=-=-=-n\n\n\n\n\n')
# data alignment same index
# 不同票价求平均值
tt_critics = Series(passanger_data['ticket'].values,index=passanger_data['PassengerId'])
tt_users = Series(passanger_data['ticket2'].values,index=passanger_data['PassengerId'])
tt_mean= (tt_critics + tt_users) /2
print(tt_mean)
# -------1--------
# #######2########
# Sandstrom, Miss. Marguerite Rut 10
# Bonnell, Miss. Elizabeth 11
# Saundercock, Mr. William Henry 12
# Andersson, Mr. Anders Johan 13
# Vestrom, Miss. Hulda Amanda Adolfina 14
# Hewlett, Mrs. (Mary D Kingcome) 15
# Rice, Master. Eugene 16
# Williams, Mr. Charles Eugene 17
# Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele) 18
# Masselmani, Mrs. Fatima 19
# dtype: int64
# --------排序-------------
# 1 22.0
# 2 38.0
# 3 26.0
# 4 35.0
# 5 35.0
# ...
# 887 27.0
# 888 19.0
# 889 NaN
# 890 26.0
# 891 32.0
# Length: 891, dtype: float64
# ----按index(键)排序---------
# 1 22.0
# 2 38.0
# 3 26.0
# 4 35.0
# 5 35.0
# ...
# 887 27.0
# 888 19.0
# 889 NaN
# 890 26.0
# 891 32.0
# Length: 891, dtype: float64
# ----按values(值)排序---------
# 804 0.42
# 756 0.67
# 645 0.75
# 470 0.75
# 79 0.83
# ...
# 860 NaN
# 864 NaN
# 869 NaN
# 879 NaN
# 889 NaN
# Length: 891, dtype: float64
# ===========================
# 1 44.0
# 2 76.0
# 3 52.0
# 4 70.0
# 5 70.0
# ...
# 887 54.0
# 888 38.0
# 889 NaN
# 890 52.0
# 891 64.0
# Length: 891, dtype: float64
# -=-=-=-=-=-=-=-=-=-=-=-=-=-n
# Nosworthy, Mr. Richard Cater 51
# Harper, Mrs. Henry Sleeper (Myna Haxtun) 52
# Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson) 53
# Ostby, Mr. Engelhart Cornelius 54
# Woolner, Mr. Hugh 55
# Rugg, Miss. Emily 56
# Novel, Mr. Mansouer 57
# West, Miss. Constance Mirium 58
# Goodwin, Master. William Frederick 59
# Sirayanian, Mr. Orsen 60
# Icard, Miss. Amelie 61
# Harris, Mr. Henry Birkhardt 62
# Skoog, Master. Harald 63
# Stewart, Mr. Albert A 64
# Moubarek, Master. Gerios 65
# Nye, Mrs. (Elizabeth Ramell) 66
# Crease, Mr. Ernest James 67
# Andersson, Miss. Erna Alexandra 68
# Kink, Mr. Vincenz 69
# Jenkin, Mr. Stephen Curnow 70
# Goodwin, Miss. Lillian Amy 71
# Hood, Mr. Ambrose Jr 72
# Chronopoulos, Mr. Apostolos 73
# Bing, Mr. Lee 74
# dtype: int64
# -=-=-=-=-=-=-=-=-=-=-=-=-=-n
# PassengerId
# 1 5.0
# 2 5.5
# 3 6.0
# 4 6.5
# 5 7.0
# ...
# 887 448.0
# 888 448.5
# 889 449.0
# 890 449.5
# 891 450.0
# Length: 891, dtype: float64
import pandas as pd
# will return a new DataFrame that is indexed by the values in the specified column
# and will drop that cloumn from the DataFrame
# without the PannengerId dropped
# DataFrame来指定一个索引值
passenger_data = pd.read_csv('titanic/train.csv')
print(type(passanger_data) )
passenger_ticket = passanger_data.set_index('Name',drop=False) # 把ticket当成一个索引
print(passenger_ticket.index) # 打印index 值
#
# 目前怀疑是数据的问题,一下索引都失败了
# 具体问题详
print('\n\n\n\n=========================')
#Slice using either bracket notation or loc[]
passenger_data["Moran,Mr.James":"Sandstrom,Miss.Marguerite Rut"]
# Specific ticiket
passanger_data.loc["Moran,Mr.James":"Sandstrom,Miss.Marguerite Rut"]
# Select list of movies
tickets = ["Sandstrom,Miss.Marguerite Rut","Moran,Mr.James","Rice,Master.Eugene"]
passenger_data.loc[tickets]
# <class 'pandas.core.frame.DataFrame'>
# Index(['Braund, Mr. Owen Harris',
# 'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
# 'Heikkinen, Miss. Laina',
# 'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
# 'Allen, Mr. William Henry', 'Moran, Mr. James',
# 'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard',
# 'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
# 'Nasser, Mrs. Nicholas (Adele Achem)',
# ...
# 'Markun, Mr. Johann', 'Dahlberg, Miss. Gerda Ulrika',
# 'Banfield, Mr. Frederick James', 'Sutehall, Mr. Henry Jr',
# 'Rice, Mrs. William (Margaret Norton)', 'Montvila, Rev. Juozas',
# 'Graham, Miss. Margaret Edith',
# 'Johnston, Miss. Catherine Helen "Carrie"', 'Behr, Mr. Karl Howell',
# 'Dooley, Mr. Patrick'],
# dtype='object', name='Name', length=891)
# =========================
# ---------------------------------------------------------------------------
# TypeError Traceback (most recent call last)
# <ipython-input-31-6c844267ac99> in <module>
# 18 print('\n\n\n\n=========================')
# 19 #Slice using either bracket notation or loc[]
# ---> 20 passenger_data["PassengerId":"ticket"]
# 21
# 22 # Specific ticiket
# F:\Software\PYTHON\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
# 2959
# 2960 # Do we have a slicer (on rows)?
# -> 2961 indexer = convert_to_index_sliceable(self, key)
# 2962 if indexer is not None:
# 2963 return self._slice(indexer, axis=0)
# F:\Software\PYTHON\lib\site-packages\pandas\core\indexing.py in convert_to_index_sliceable(obj, key)
# 2356 idx = obj.index
# 2357 if isinstance(key, slice):
# -> 2358 return idx._convert_slice_indexer(key, kind="getitem")
# 2359
# 2360 elif isinstance(key, str):
# F:\Software\PYTHON\lib\site-packages\pandas\core\indexes\base.py in _convert_slice_indexer(self, key, kind)
# 3188 if self.is_integer() or is_index_slice:
# 3189 return slice(
# -> 3190 self._validate_indexer("slice", key.start, kind),
# 3191 self._validate_indexer("slice", key.stop, kind),
# 3192 self._validate_indexer("slice", key.step, kind),
# F:\Software\PYTHON\lib\site-packages\pandas\core\indexes\base.py in _validate_indexer(self, form, key, kind)
# 5069 pass
# 5070 elif kind in ["iloc", "getitem"]:
# -> 5071 self._invalid_indexer(form, key)
# 5072 return key
# 5073
# F:\Software\PYTHON\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
# 3338 "cannot do {form} indexing on {klass} with these "
# 3339 "indexers [{key}] of {kind}".format(
# -> 3340 form=form, klass=type(self), key=key, kind=type(key)
# 3341 )
# 3342 )
# TypeError: cannot do slice indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [PassengerId] of <class 'str'>
类型转换
## 类型转换
# The apply() method in Pandas allows us to specify Python logic
# The apply() method requires you to pass in a vectorized operation
# that can be applied over each Series object.
import numpy as np
import pandas as pd
passenger_data = pd.read_csv('titanic/train.csv')
#returns the data types as a Series
types = passenger_data.dtypes
print(types)
# filter data types to just floats,index attributes returns just column names
float_columns = types[types.values == 'int64'].index
#use bracket notation to filter columns to just float columns
float_df = passenger_data[float_columns]
print(float_df)
# 'x' is a Series object representing a column
deviations =float_df.apply(lambda x: np.std(x))
print('--------------------------')
print(deviations)
print('\n\n命名函数lambda')
rt_mt_user = float_df[['ticket','ticket2']]
rt_mt_user.apply(lambda x:np.std(x),axis=1) #对每一个指标算标准差
# PassengerId int64
# Survived int64
# Pclass int64
# Name object
# Sex object
# Age float64
# SibSp int64
# Parch int64
# Ticket object
# Fare float64
# Cabin object
# Embarked object
# ticket int64
# ticket2 int64
# dtype: object
# PassengerId Survived Pclass SibSp Parch ticket ticket2
# 0 1 0 3 1 0 0 10
# 1 2 1 1 1 0 1 10
# 2 3 1 3 0 0 2 10
# 3 4 1 1 1 0 3 10
# 4 5 0 3 0 0 4 10
# .. ... ... ... ... ... ... ...
# 886 887 0 2 0 0 886 10
# 887 888 1 1 0 0 887 10
# 888 889 0 3 1 2 888 10
# 889 890 1 1 0 0 889 10
# 890 891 0 3 0 0 890 10
# [891 rows x 7 columns]
# --------------------------
# PassengerId 257.209383
# Survived 0.486319
# Pclass 0.835602
# SibSp 1.102124
# Parch 0.805605
# ticket 257.209383
# ticket2 0.000000
# dtype: float64
# 命名函数lambda
# 0 5.0
# 1 4.5
# 2 4.0
# 3 3.5
# 4 3.0
# ...
# 886 438.0
# 887 438.5
# 888 439.0
# 889 439.5
# 890 440.0
# Length: 891, dtype: float64