接外包,有相关需求的可以联系我:Telegram | Email

Pandas

该文章创建(更新)于10/8/2019,请注意文章的时效性!

pandas

数据处理库

读取文件

pandas.read_csv('xxxx.csv')

import pandas as pd
m = pd.read_csv('m0.csv')
print(type(m))
print(m.dtypes)
# 字符值为object

# print(help(pd.read_csv))
# <class 'pandas.core.frame.DataFrame'>
# noteid        int64
# notebook     object
# username     object
# date         object
# dtype: object

显示数据

m.head()  #把数据显示出来,默认显示前五条
# m.head(3) #显示前3条
# m.tail(4) # 显示后四行
# print(m.columns) # 显示列的指标
# print(m.shape) # 查看数据维度(m,n)=》表示总共有m个样本,每个样本有n个指标

取数据

# location
### 取第一个数据
print(m.loc[0])
print('----------')
print(m.loc[3])

### 取从3开始到6结束
m.loc[3:6]

### 取1,3,5
id = {1,3,5}
m.loc[id]

### 一列一列取
col = m['noteid'] # 列名来定义,如无则默认为第一行为列名

print(col)

### 取多列
col = ['noteid','username']
data = m[col]
print(data)

# 取以什么来结尾的列
# 比如找以(kg)结尾的

import pandas as pd
m = pd.read_csv('m0.csv')

col_names = m.columns.tolist()  #把当前列名变成list
print(col_names)

gram_columns = []

for c in col_names:
    if c.endswith("(kg)"):    # 以(kg)结尾
        gram_columns.append(c)  #追加

gram_def = m[gram_columns]

print(gram_def.head(3))

# noteid                0
#  school               A
# username          adads
# height(cm)          160
# weight(kg)           40
# date          2019/8/27
# Name: 0, dtype: object
# ----------
# noteid                3
#  school               D
# username      zcvxvzxcv
# height(cm)          163
# weight(kg)           43
# date          2019/8/30
# Name: 3, dtype: object

运算

# 四则运算
div_m = m['height(cm)'] / 100
print(div_m)

# 0     1.60
# 1     1.61
# 2     1.62
# 3     1.63
# 4     1.64
# 5     1.65
# 6     1.66
# 7     1.67
# 8     1.68
# 9     1.69
# 10    1.70
# Name: height(cm), dtype: float64

新加一列

# 相同维度的运算
import pandas as pd
m = pd.read_csv('m0.csv')
data = m['height(cm)'] * m['weight(kg)']
print(data)
print(data.shape)
print('======新加一列======')
height_m = m['height(cm)'] / 100
m['height(m)'] = height_m  # 是添加了,但并未写入文件
print(m['height(m)'])   
print(m)
# 0     6400
# 1     6601
# 2     6804
# 3     7009
# 4     7216
# 5     7425
# 6     7636
# 7     7849
# 8     8064
# 9     8281
# 10    8500
# dtype: int64
# (11,)
# ======新加一列======
# 0     1.60
# 1     1.61
# 2     1.62
# 3     1.63
# 4     1.64
# 5     1.65
# 6     1.66
# 7     1.67
# 8     1.68
# 9     1.69
# 10    1.70
# Name: height(m), dtype: float64
#     noteid     school   username  height(cm)  weight(kg)       date  height(m)
# 0        0          A      adads         160          40  2019/8/27       1.60
# 1        1          B    sdfsadf         161          41  2019/8/28       1.61
# 2        2          C    sdfasdf         162          42  2019/8/29       1.62
# 3        3          D  zcvxvzxcv         163          43  2019/8/30       1.63
# 4        4          E     sdfasf         164          44  2019/8/31       1.64
# 5        5     支持v在v在     必胜德国法国         165          45   2019/9/1       1.65
# 6        6      在v秩序册     在v出租车v         166          46   2019/9/2       1.66
# 7        7     支持v在v从    在v从中选出v         167          47   2019/9/3       1.67
# 8        8  v自行车v自行车v     在v出租车v         168          48   2019/9/4       1.68
# 9        9      在v现在v    自行车v在v从         169          49   2019/9/5       1.69
# 10      10  豆腐干豆腐干大锅饭     在v自行车v         170          50   2019/9/6       1.70

排序

import pandas as pd
m = pd.read_csv('m0.csv')

print('-------升序------')
m.sort_values("weight(kg)",inplace=True) # 默认ascending为Ture,用来升序
print(m["weight(kg)"])

print('-------降序------')

m.sort_values("weight(kg)",inplace=True,ascending=False) # 默认ascending为Ture,用来升序
print(m['weight(kg)'])

# -------升序------
# 0      40
# 1      41
# 3      43
# 4      44
# 6      46
# 7      47
# 8      48
# 9      49
# 10     50
# 5      66
# 2     888
# Name: weight(kg), dtype: int64
# -------降序------
# 2     888
# 5      66
# 10     50
# 9      49
# 8      48
# 7      47
# 6      46
# 4      44
# 3      43
# 1      41
# 0      40
# Name: weight(kg), dtype: int64

NAN的含义

pandas认为NAN为缺失值,或打印不出来的值。一般把缺失值放到最后

math、numpy、pandas NaN 判断

>> np.nan == np.nan
False
>> np.nan is np.nan
True

>> math.nan is np.nan
False
>> np.isnan(math.nan)
True

1. 判断 ndarray 中是否存在 nan

>> c = np.array([ 1.,  2., np.nan,  3.,  4.])
>> np.isnan(c)
array([False, False,  True, False, False])
  • 注意

    >> np.nan != np.nan
    True
    >> np.nan in c
    False
  • 上述方案要么返回的是一个序列,要么给出的是错误的结果。判断 numpy 下的多维数组中是否存在 nan 的简单方式:

    >> np.isnan(np.min(c))
    True
    >> np.isnan(np.sum(c))
    True
  • 因为

    >> np.min(c)
    nan
    >> np.sum(c)
    nan

2. 将 nan 填充为均值

>> c = np.array([ 1.,  2., np.nan,  3.,  4.])
>> c[np.isnan(c)] = np.mean(c[~np.nan(c)])

版权声明:本文为CSDN博主「Inside_Zhang」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。原文链接:https://blog.csdn.net/lanchunhui/article/details/80399681

NAN例子

import pandas as pd
import numpy as np
data = pd.read_csv('titanic/train.csv') # https://www.kaggle.com/c/titanic/data
data.head()

#
# 数据内容
#
# pclass 仓位等级
# SibSp 兄弟姐妹数量
# Parch 父母/子女
# Fare 船票价格
# Cabin 床仓编号/NaN无该值
# Embarked 登船地点/码头

print('-------------')

age = data["Age"]
print(age.loc[0:10]) # 读取前十个值
print('@@@@@@@@@@@@@@@@@')
age_is_null = pd.isnull(age) # 判断缺失值,false则不是,ture则是缺失值
print(age_is_null)
print('@@@@@@@@@@@@@@@@')
age_null_true = age[age_is_null] # 筛选,这里传入的为(true/false),把true的留下来
print(age_null_true)
print('@@@@@@@@@@@@@@@@')
age_null_count = len(age_null_true) # 当前长度
print(age_null_count)

# 未处理缺失值的情况
print('************未处理的情况**************')
mean_age = sum(data['Age']) / len(data['Age'])  # 有缺失值则结果为NaN,
print(mean_age)

# 处理后的情况
print('&&&&&&&&&&&&&&处理后的情况&&&&&&&&&&')
good_ages = data['Age'][age_is_null == False]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

print('&&&&&&&&&&&&&&Pandas函数mean()来实现&&&&&&&&&&')
# pandas默认的函数来实现以上功能
correct_mean_age = data['Age'].mean()
print(correct_mean_age)

# 每个仓位等级的平均价格
print('每个仓位等级的平均价格')
passanger_classes = [1,2,3]
fares_by_class = {}
for this_class in passanger_classes:
    pclass_rows = data[data["Pclass"] == this_class ]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print(fares_by_class)

# pandas快速来实现
## 函数pivot_table
print('依靠函数pivot_table来实现上述功能')
passanger_survival = data.pivot_table(index="Pclass",values="Fare",aggfunc=np.mean)  # index:以谁为基准,values:index和什么的关系,aggfunc:指什么关系
print(passanger_survival)

print('================================')
# 
# 默认求平均值
#
passanger_age = data.pivot_table(index="Pclass",values="Age") # 求平均年龄,少写一个aggfun。按照默认求均值来操作
print(passanger_age)

print('++++++++++++++++++++++++++++++++')
port_stats = data.pivot_table(index="Embarked",values=["Fare","Survived"],aggfunc=np.sum) #一个量和其它两个量之间的关系
print(port_stats)

## 函数 dropna
print('==============dropna/把缺失值丢掉==================')
#specifying axis = 1 or axis="columns" will drop any columns that have null values
drop_na_columns = data.dropna(axis=1)  
print(drop_na_columns)
print('++==++')
new_passanger_survival = data.dropna(axis = 0,subset=['Age','Sex'])  # 如果这俩列有缺失值,则把有缺失值当前对应的行给去掉
print(new_passanger_survival)

print('^^^^^^^^^^^查找/定位到一个具体值^^^^^^^^^^^^^')
row_index_83_age = data.loc[83,"Age"]  # 83表示行,"Age"表示这一行"Age"这一列;一下同理
row_index_1000_pclass = data.loc[766,"Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)

# -------------
# 0     22.0
# 1     38.0
# 2     26.0
# 3     35.0
# 4     35.0
# 5      NaN
# 6     54.0
# 7      2.0
# 8     27.0
# 9     14.0
# 10     4.0
# Name: Age, dtype: float64
# @@@@@@@@@@@@@@@@@
# 0      False
# 1      False
# 2      False
# 3      False
# 4      False
#        ...  
# 886    False
# 887    False
# 888     True
# 889    False
# 890    False
# Name: Age, Length: 891, dtype: bool
# @@@@@@@@@@@@@@@@
# 5     NaN
# 17    NaN
# 19    NaN
# 26    NaN
# 28    NaN
#        ..
# 859   NaN
# 863   NaN
# 868   NaN
# 878   NaN
# 888   NaN
# Name: Age, Length: 177, dtype: float64
# @@@@@@@@@@@@@@@@
# 177
# ************未处理的情况**************
# nan
# &&&&&&&&&&&&&&处理后的情况&&&&&&&&&&
# 29.69911764705882
# &&&&&&&&&&&&&&Pandas函数mean()来实现&&&&&&&&&&
# 29.69911764705882
# 每个仓位等级的平均价格
# {1: 84.1546875, 2: 20.662183152173913, 3: 13.675550101832993}
# 依靠函数pivot_table来实现上述功能
#              Fare
# Pclass           
# 1       84.154687
# 2       20.662183
# 3       13.675550
# ================================
#               Age
# Pclass           
# 1       38.233441
# 2       29.877630
# 3       25.140620
# ++++++++++++++++++++++++++++++++
#                 Fare  Survived
# Embarked                      
# C         10072.2962        93
# Q          1022.2543        30
# S         17439.3988       217
# ==============dropna/把缺失值丢掉==================
#      PassengerId  Survived  Pclass  \
# 0              1         0       3   
# 1              2         1       1   
# 2              3         1       3   
# 3              4         1       1   
# 4              5         0       3   
# ..           ...       ...     ...   
# 886          887         0       2   
# 887          888         1       1   
# 888          889         0       3   
# 889          890         1       1   
# 890          891         0       3   

#                                                   Name     Sex  SibSp  Parch  \
# 0                              Braund, Mr. Owen Harris    male      1      0   
# 1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female      1      0   
# 2                               Heikkinen, Miss. Laina  female      0      0   
# 3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female      1      0   
# 4                             Allen, Mr. William Henry    male      0      0   
# ..                                                 ...     ...    ...    ...   
# 886                              Montvila, Rev. Juozas    male      0      0   
# 887                       Graham, Miss. Margaret Edith  female      0      0   
# 888           Johnston, Miss. Catherine Helen "Carrie"  female      1      2   
# 889                              Behr, Mr. Karl Howell    male      0      0   
# 890                                Dooley, Mr. Patrick    male      0      0   

#                Ticket     Fare  
# 0           A/5 21171   7.2500  
# 1            PC 17599  71.2833  
# 2    STON/O2. 3101282   7.9250  
# 3              113803  53.1000  
# 4              373450   8.0500  
# ..                ...      ...  
# 886            211536  13.0000  
# 887            112053  30.0000  
# 888        W./C. 6607  23.4500  
# 889            111369  30.0000  
# 890            370376   7.7500  

# [891 rows x 9 columns]
# ++==++
#      PassengerId  Survived  Pclass  \
# 0              1         0       3   
# 1              2         1       1   
# 2              3         1       3   
# 3              4         1       1   
# 4              5         0       3   
# ..           ...       ...     ...   
# 885          886         0       3   
# 886          887         0       2   
# 887          888         1       1   
# 889          890         1       1   
# 890          891         0       3   

#                                                   Name     Sex   Age  SibSp  \
# 0                              Braund, Mr. Owen Harris    male  22.0      1   
# 1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
# 2                               Heikkinen, Miss. Laina  female  26.0      0   
# 3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
# 4                             Allen, Mr. William Henry    male  35.0      0   
# ..                                                 ...     ...   ...    ...   
# 885               Rice, Mrs. William (Margaret Norton)  female  39.0      0   
# 886                              Montvila, Rev. Juozas    male  27.0      0   
# 887                       Graham, Miss. Margaret Edith  female  19.0      0   
# 889                              Behr, Mr. Karl Howell    male  26.0      0   
# 890                                Dooley, Mr. Patrick    male  32.0      0   

#      Parch            Ticket     Fare Cabin Embarked  
# 0        0         A/5 21171   7.2500   NaN        S  
# 1        0          PC 17599  71.2833   C85        C  
# 2        0  STON/O2. 3101282   7.9250   NaN        S  
# 3        0            113803  53.1000  C123        S  
# 4        0            373450   8.0500   NaN        S  
# ..     ...               ...      ...   ...      ...  
# 885      5            382652  29.1250   NaN        Q  
# 886      0            211536  13.0000   NaN        S  
# 887      0            112053  30.0000   B42        S  
# 889      0            111369  30.0000  C148        C  
# 890      0            370376   7.7500   NaN        Q  

# [714 rows x 12 columns]
# ^^^^^^^^^^^查找^^^^^^^^^^^^^
# 28.0
# 1

排序

## 排序
import pandas as pd
passanger_data = pd.read_csv('titanic/train.csv')
new_passanger_survival = passanger_data.sort_values("Age",ascending=False) # 按年龄来降序排序
print(new_passanger_survival)
passanger_reindex = new_passanger_survival.reset_index(drop=True) # 把index值(索引值)按降序规则来来重新排序
print('--------------------------')
print(passanger_reindex.loc[0:10])  # \ 表示换行显示

#      PassengerId  Survived  Pclass                                      Name  \
# 630          631         1       1      Barkworth, Mr. Algernon Henry Wilson   
# 851          852         0       3                       Svensson, Mr. Johan   
# 493          494         0       1                   Artagaveytia, Mr. Ramon   
# 96            97         0       1                 Goldschmidt, Mr. George B   
# 116          117         0       3                      Connors, Mr. Patrick   
# ..           ...       ...     ...                                       ...   
# 859          860         0       3                          Razi, Mr. Raihed   
# 863          864         0       3         Sage, Miss. Dorothy Edith "Dolly"   
# 868          869         0       3               van Melkebeke, Mr. Philemon   
# 878          879         0       3                        Laleff, Mr. Kristo   
# 888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   

#         Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
# 630    male  80.0      0      0       27042  30.0000   A23        S  
# 851    male  74.0      0      0      347060   7.7750   NaN        S  
# 493    male  71.0      0      0    PC 17609  49.5042   NaN        C  
# 96     male  71.0      0      0    PC 17754  34.6542    A5        C  
# 116    male  70.5      0      0      370369   7.7500   NaN        Q  
# ..      ...   ...    ...    ...         ...      ...   ...      ...  
# 859    male   NaN      0      0        2629   7.2292   NaN        C  
# 863  female   NaN      8      2    CA. 2343  69.5500   NaN        S  
# 868    male   NaN      0      0      345777   9.5000   NaN        S  
# 878    male   NaN      0      0      349217   7.8958   NaN        S  
# 888  female   NaN      1      2  W./C. 6607  23.4500   NaN        S  

# [891 rows x 12 columns]
# --------------------------
#     PassengerId  Survived  Pclass                                  Name   Sex  \
# 0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
# 1           852         0       3                   Svensson, Mr. Johan  male   
# 2           494         0       1               Artagaveytia, Mr. Ramon  male   
# 3            97         0       1             Goldschmidt, Mr. George B  male   
# 4           117         0       3                  Connors, Mr. Patrick  male   
# 5           673         0       2           Mitchell, Mr. Henry Michael  male   
# 6           746         0       1          Crosby, Capt. Edward Gifford  male   
# 7            34         0       2                 Wheadon, Mr. Edward H  male   
# 8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
# 9           281         0       3                      Duane, Mr. Frank  male   
# 10          457         0       1             Millet, Mr. Francis Davis  male   

#      Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
# 0   80.0      0      0       27042  30.0000   A23        S  
# 1   74.0      0      0      347060   7.7750   NaN        S  
# 2   71.0      0      0    PC 17609  49.5042   NaN        C  
# 3   71.0      0      0    PC 17754  34.6542    A5        C  
# 4   70.5      0      0      370369   7.7500   NaN        Q  
# 5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
# 6   70.0      1      1   WE/P 5735  71.0000   B22        S  
# 7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
# 8   65.0      0      1      113509  61.9792   B30        C  
# 9   65.0      0      0      336439   7.7500   NaN        Q  
# 10  65.0      0      0       13509  26.5500   E38        S  

自定义函数

import pandas as pd
passanger_data = pd.read_csv('titanic/train.csv')

# this function returns the bundbreath(第一百行) item from a series
def hundredth_row(column):
    # Extract the hundredth item
    hundredth_items  = column.loc[99] # start form 0
    return hundredth_items

# Return the hundredth item from each column
hundredth_row = passanger_data.apply(hundredth_row)  # apply 用来申请执行
print(hundredth_row)

print()

# 返回每一个缺失值的个数
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = passanger_data.apply(not_null_count)
print(column_null_count)

# 
def which_class(row):
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return 'Unknown'
    elif pclass == 1:
        return 'First Class'
    elif pclass == 2:
        return 'Second Class'
    elif pclass == 3:
        return 'Third Class'

classes = passanger_data.apply(which_class,axis = 1)  # axis ?? means what??
print(classes)

print()

def is_minor(row):
    if row['Age'] < 18:
        return True
    else:
        return False
minors = passanger_data.apply(is_minor,axis = 1)

# print minors

def generate_age_label(row):
    age = row['Age']
    if pd.isnull(age):
        return 'Unknown'
    elif age < 18:
        return 'minor'
    else:
        return 'adult'

age_labels = passanger_data.apply(generate_age_label,axis = 1)
print(age_labels)

print('--------------')
passanger_data['age_labels'] = age_labels
age_group_survival = passanger_data.pivot_table(index="age_labels",values="Survived")  #默认求各年龄获救的平局值
print(age_group_survival)

# PassengerId                  100
# Survived                       0
# Pclass                         2
# Name           Kantor, Mr. Sinai
# Sex                         male
# Age                           34
# SibSp                          1
# Parch                          0
# Ticket                    244367
# Fare                          26
# Cabin                        NaN
# Embarked                       S
# dtype: object

# PassengerId      0
# Survived         0
# Pclass           0
# Name             0
# Sex              0
# Age            177
# SibSp            0
# Parch            0
# Ticket           0
# Fare             0
# Cabin          687
# Embarked         2
# dtype: int64
# 0       Third Class
# 1       First Class
# 2       Third Class
# 3       First Class
# 4       Third Class
#            ...     
# 886    Second Class
# 887     First Class
# 888     Third Class
# 889     First Class
# 890     Third Class
# Length: 891, dtype: object

# 0        adult
# 1        adult
# 2        adult
# 3        adult
# 4        adult
#         ...   
# 886      adult
# 887      adult
# 888    Unknown
# 889      adult
# 890      adult
# Length: 891, dtype: object
# --------------
#             Survived
# age_labels          
# Unknown     0.293785
# adult       0.381032
# minor       0.539823

Series

## Import the Series object from pandas
## ???
from pandas import Series  
passanger_data = pd.read_csv('titanic/train.csv') 
series_files = passanger_data['Name']  # 其中的一列
Passanger_name = series_files.values # series该列里面的值

# print(type(Passanger_name))
# print(Passanger_name)
print('---------------')
series_rt = passanger_data['Ticket']
rt_ticket = series_rt.values
# print(rt_scores)
series_custom = Series(rt_ticket,index = Passanger_name) # 用名字当索引
# series_custom[['Odahl, Mr. Nils Martin','Jonkoff, Mr. Lalio']] # 打印  这和下面大打印只能存在一个,如都存在,这一个不会显示
print('################')
fiveten = series_custom[10:20]
print(fiveten)

## 排序
print('\n\n--------排序-------------\n')
series_files = passanger_data['PassengerId']  # 其中的一列
Passanger_id = series_files.values # series该列里面的值

series_age = passanger_data['Age']
Passanger_age = series_age.values

series_custom1 = Series(Passanger_age,index = Passanger_id)  # 把id当索引
original_index = series_custom1.index.tolist()
#print original_index
sorted_index = sorted(original_index)
sorted_by_index = series_custom1.reindex(sorted_index)
print(sorted_by_index)

print('----按index(键)排序---------')
sc2 = series_custom1.sort_index()  # 按键排序 
print(sc2)
print('\n----按values(值)排序---------') # 按值排序
sc3 = series_custom1.sort_values()
print(sc3)

print('===========================')
## 数学运算
# The value in a Series object are treated as an ndayyat,the core data type in Numpy
import numpy as np
# Add each value with each other
print(np.add(series_custom1,series_custom1))  # 值一样这之间相加,值不一样则对应相加????

# Apply sin function to each other
#
# np.sine(series_custom)
#
# module 'numpy' has no attribute 'sine'

# Return the hightest value (vill return a single value not a Series)
np.max(series_custom1)

# will actually return a Series object with a boolean value for each ticket
# > 50 返回一些true/false值
series_custom > 50
series_greater_than_50 = series_custom[series_custom > 50] # 拿true/false来返回值

series_one = series_custom > 50
series_two = series_custom < 75
both_criteria = series_custom[series_one & series_two]

print(both_criteria)

print('-=-=-=-=-=-=-=-=不同票价求平均值=-=-=-=-=-n\n\n\n\n\n')

# data alignment same index
# 不同票价求平均值
tt_critics = Series(passanger_data['ticket'].values,index=passanger_data['PassengerId'])
tt_users = Series(passanger_data['ticket2'].values,index=passanger_data['PassengerId'])      
tt_mean= (tt_critics + tt_users) /2
print(tt_mean)

# -------1--------
# #######2########
# Sandstrom, Miss. Marguerite Rut                            10
# Bonnell, Miss. Elizabeth                                   11
# Saundercock, Mr. William Henry                             12
# Andersson, Mr. Anders Johan                                13
# Vestrom, Miss. Hulda Amanda Adolfina                       14
# Hewlett, Mrs. (Mary D Kingcome)                            15
# Rice, Master. Eugene                                       16
# Williams, Mr. Charles Eugene                               17
# Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)    18
# Masselmani, Mrs. Fatima                                    19
# dtype: int64

# --------排序-------------

# 1      22.0
# 2      38.0
# 3      26.0
# 4      35.0
# 5      35.0
#        ... 
# 887    27.0
# 888    19.0
# 889     NaN
# 890    26.0
# 891    32.0
# Length: 891, dtype: float64
# ----按index(键)排序---------
# 1      22.0
# 2      38.0
# 3      26.0
# 4      35.0
# 5      35.0
#        ... 
# 887    27.0
# 888    19.0
# 889     NaN
# 890    26.0
# 891    32.0
# Length: 891, dtype: float64

# ----按values(值)排序---------
# 804    0.42
# 756    0.67
# 645    0.75
# 470    0.75
# 79     0.83
#        ... 
# 860     NaN
# 864     NaN
# 869     NaN
# 879     NaN
# 889     NaN
# Length: 891, dtype: float64
# ===========================
# 1      44.0
# 2      76.0
# 3      52.0
# 4      70.0
# 5      70.0
#        ... 
# 887    54.0
# 888    38.0
# 889     NaN
# 890    52.0
# 891    64.0
# Length: 891, dtype: float64
# -=-=-=-=-=-=-=-=-=-=-=-=-=-n

# Nosworthy, Mr. Richard Cater                          51
# Harper, Mrs. Henry Sleeper (Myna Haxtun)              52
# Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)    53
# Ostby, Mr. Engelhart Cornelius                        54
# Woolner, Mr. Hugh                                     55
# Rugg, Miss. Emily                                     56
# Novel, Mr. Mansouer                                   57
# West, Miss. Constance Mirium                          58
# Goodwin, Master. William Frederick                    59
# Sirayanian, Mr. Orsen                                 60
# Icard, Miss. Amelie                                   61
# Harris, Mr. Henry Birkhardt                           62
# Skoog, Master. Harald                                 63
# Stewart, Mr. Albert A                                 64
# Moubarek, Master. Gerios                              65
# Nye, Mrs. (Elizabeth Ramell)                          66
# Crease, Mr. Ernest James                              67
# Andersson, Miss. Erna Alexandra                       68
# Kink, Mr. Vincenz                                     69
# Jenkin, Mr. Stephen Curnow                            70
# Goodwin, Miss. Lillian Amy                            71
# Hood, Mr. Ambrose Jr                                  72
# Chronopoulos, Mr. Apostolos                           73
# Bing, Mr. Lee                                         74
# dtype: int64
# -=-=-=-=-=-=-=-=-=-=-=-=-=-n

# PassengerId
# 1        5.0
# 2        5.5
# 3        6.0
# 4        6.5
# 5        7.0
#        ...  
# 887    448.0
# 888    448.5
# 889    449.0
# 890    449.5
# 891    450.0
# Length: 891, dtype: float64
import pandas as pd
# will return a new DataFrame that is indexed by the values in the specified column
# and will drop that cloumn from the DataFrame
# without the PannengerId dropped

# DataFrame来指定一个索引值

passenger_data = pd.read_csv('titanic/train.csv')
print(type(passanger_data) )
passenger_ticket = passanger_data.set_index('Name',drop=False) # 把ticket当成一个索引
print(passenger_ticket.index) # 打印index 值

#
# 目前怀疑是数据的问题,一下索引都失败了
# 具体问题详

print('\n\n\n\n=========================')
#Slice using either bracket notation or loc[]
passenger_data["Moran,Mr.James":"Sandstrom,Miss.Marguerite Rut"]

# Specific ticiket
passanger_data.loc["Moran,Mr.James":"Sandstrom,Miss.Marguerite Rut"]

# Select list of movies
tickets  = ["Sandstrom,Miss.Marguerite Rut","Moran,Mr.James","Rice,Master.Eugene"]
passenger_data.loc[tickets]
# <class 'pandas.core.frame.DataFrame'>
# Index(['Braund, Mr. Owen Harris',
#        'Cumings, Mrs. John Bradley (Florence Briggs Thayer)',
#        'Heikkinen, Miss. Laina',
#        'Futrelle, Mrs. Jacques Heath (Lily May Peel)',
#        'Allen, Mr. William Henry', 'Moran, Mr. James',
#        'McCarthy, Mr. Timothy J', 'Palsson, Master. Gosta Leonard',
#        'Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)',
#        'Nasser, Mrs. Nicholas (Adele Achem)',
#        ...
#        'Markun, Mr. Johann', 'Dahlberg, Miss. Gerda Ulrika',
#        'Banfield, Mr. Frederick James', 'Sutehall, Mr. Henry Jr',
#        'Rice, Mrs. William (Margaret Norton)', 'Montvila, Rev. Juozas',
#        'Graham, Miss. Margaret Edith',
#        'Johnston, Miss. Catherine Helen "Carrie"', 'Behr, Mr. Karl Howell',
#        'Dooley, Mr. Patrick'],
#       dtype='object', name='Name', length=891)

# =========================
# ---------------------------------------------------------------------------
# TypeError                                 Traceback (most recent call last)
# <ipython-input-31-6c844267ac99> in <module>
#      18 print('\n\n\n\n=========================')
#      19 #Slice using either bracket notation or loc[]
# ---> 20 passenger_data["PassengerId":"ticket"]
#      21 
#      22 # Specific ticiket

# F:\Software\PYTHON\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
#    2959 
#    2960         # Do we have a slicer (on rows)?
# -> 2961         indexer = convert_to_index_sliceable(self, key)
#    2962         if indexer is not None:
#    2963             return self._slice(indexer, axis=0)

# F:\Software\PYTHON\lib\site-packages\pandas\core\indexing.py in convert_to_index_sliceable(obj, key)
#    2356     idx = obj.index
#    2357     if isinstance(key, slice):
# -> 2358         return idx._convert_slice_indexer(key, kind="getitem")
#    2359 
#    2360     elif isinstance(key, str):

# F:\Software\PYTHON\lib\site-packages\pandas\core\indexes\base.py in _convert_slice_indexer(self, key, kind)
#    3188             if self.is_integer() or is_index_slice:
#    3189                 return slice(
# -> 3190                     self._validate_indexer("slice", key.start, kind),
#    3191                     self._validate_indexer("slice", key.stop, kind),
#    3192                     self._validate_indexer("slice", key.step, kind),

# F:\Software\PYTHON\lib\site-packages\pandas\core\indexes\base.py in _validate_indexer(self, form, key, kind)
#    5069             pass
#    5070         elif kind in ["iloc", "getitem"]:
# -> 5071             self._invalid_indexer(form, key)
#    5072         return key
#    5073 

# F:\Software\PYTHON\lib\site-packages\pandas\core\indexes\base.py in _invalid_indexer(self, form, key)
#    3338             "cannot do {form} indexing on {klass} with these "
#    3339             "indexers [{key}] of {kind}".format(
# -> 3340                 form=form, klass=type(self), key=key, kind=type(key)
#    3341             )
#    3342         )

# TypeError: cannot do slice indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [PassengerId] of <class 'str'>

类型转换

## 类型转换

# The apply() method in Pandas allows us to specify Python logic
# The apply() method requires you to pass in a vectorized operation
# that can be applied over each Series object.
import numpy as np
import pandas as pd

passenger_data = pd.read_csv('titanic/train.csv')

#returns the data types as a Series
types = passenger_data.dtypes
print(types)

# filter data types to just floats,index attributes returns just column names
float_columns = types[types.values == 'int64'].index
#use bracket notation to filter columns to just float columns
float_df = passenger_data[float_columns]
print(float_df)
# 'x' is a Series object representing a column
deviations =float_df.apply(lambda x: np.std(x))

print('--------------------------')

print(deviations)

print('\n\n命名函数lambda')
rt_mt_user = float_df[['ticket','ticket2']]
rt_mt_user.apply(lambda x:np.std(x),axis=1)  #对每一个指标算标准差
# PassengerId      int64
# Survived         int64
# Pclass           int64
# Name            object
# Sex             object
# Age            float64
# SibSp            int64
# Parch            int64
# Ticket          object
# Fare           float64
# Cabin           object
# Embarked        object
# ticket           int64
# ticket2          int64
# dtype: object
#      PassengerId  Survived  Pclass  SibSp  Parch  ticket  ticket2
# 0              1         0       3      1      0       0       10
# 1              2         1       1      1      0       1       10
# 2              3         1       3      0      0       2       10
# 3              4         1       1      1      0       3       10
# 4              5         0       3      0      0       4       10
# ..           ...       ...     ...    ...    ...     ...      ...
# 886          887         0       2      0      0     886       10
# 887          888         1       1      0      0     887       10
# 888          889         0       3      1      2     888       10
# 889          890         1       1      0      0     889       10
# 890          891         0       3      0      0     890       10

# [891 rows x 7 columns]
# --------------------------
# PassengerId    257.209383
# Survived         0.486319
# Pclass           0.835602
# SibSp            1.102124
# Parch            0.805605
# ticket         257.209383
# ticket2          0.000000
# dtype: float64

# 命名函数lambda
# 0        5.0
# 1        4.5
# 2        4.0
# 3        3.5
# 4        3.0
#        ...  
# 886    438.0
# 887    438.5
# 888    439.0
# 889    439.5
# 890    440.0
# Length: 891, dtype: float64

wanna more in wiki


要不赞赏一下?

微信
支付宝
PayPal
Bitcoin

版权声明 | Copyright

除非特别说明,本博客所有作品均采用知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议进行许可。转载请注明转自-
https://www.emperinter.info/2019/10/08/pandas/


要不聊聊?

我相信你准备留下的内容是经过思考的!【勾选防爬虫,未勾选无法留言】

*

*



YouTube | B站

微信公众号

👉 NewsLetter ❤️ 邮箱订阅 👈

优惠码

阿里云国际版20美元
Vultr10美元
搬瓦工 | Bandwagon应该有折扣吧?
Just My SocksJMS9272283 【注意手动复制去跳转】
域名 | namesiloemperinter(1美元)
币安 币安