pandas-dataframe

发表于 2019-01-16 更新于 2022-03-27 分类于开源库阅读次数：本文字数： 12k 阅读时长 ≈ 11 分钟

条件筛选

riseUp = len(df[(0.0 < df.rise_percent) & (df.rise_percent < 10.0)])
result = result[(result['var']>0.25) | (result['var']<-0.25)]

count = df['value'].isna().sum()
#或者 count = df['value'].isnull().sum()
(df['value'] == 'NaN').sum()

# 过滤行：不满足条件的值会被置为 NaN，再删除所有列都是 NaN（即所有列都不满足条件）的行
df = df[(df > -2147483645.0)].dropna(how='all')
# 过滤列：不满足条件的值会被置为 NaN，再删除含有 NaN（即存在不满足条件的值）的列
df = df[(df > -2147483645.0)].dropna(axis=1)

遍历数据

iterrows

# 遍历列（iteritems() 已在 pandas 2.0 中移除，请使用等价的 items()）
for label, content in df_mean.items():
    print(f'label: {label}')
    print(f'content: {content}', sep='\n')

# Iterate over DataFrame rows as (index, Series) pairs.
for idx,row in df.iterrows():
    dataX = row[0]
    dataY = row[1]

# 倒序
for idx, row in df[::-1].iterrows():
    pass

Yields

index : label or tuple of label
    The index of the row. A tuple for a MultiIndex.

data : Series
    The data of the row as a Series.

行列选择 iloc

通过索引定位行，列，输入参数是整数类型

iloc

Selecting data by row numbers (.iloc)
Selecting data by label or by a conditional statement (.loc)

The iloc indexer syntax is data.iloc[<row selection>, <column selection>].

# Single selections using iloc and DataFrame
# Rows:
data.iloc[0] # first row of data frame (Aleshia Tomkiewicz) - Note a Series data type output.
data.iloc[1] # second row of data frame (Evan Zigomalas)
data.iloc[-1] # last row of data frame (Mi Richan)
# Columns:
data.iloc[:,0] # first column of data frame (first_name)
data.iloc[:,1] # second column of data frame (last_name)
data.iloc[:,-1] # last column of data frame (id)

# Multiple row and column selections using iloc and DataFrame
data.iloc[0:5] # first five rows of dataframe
data.iloc[:, 0:2] # first two columns of data frame with all rows
data.iloc[[0,3,6,24], [0,5,6]] # 1st, 4th, 7th, 25th row + 1st 6th 7th columns.
data.iloc[0:5, 5:8] # first 5 rows and 6th, 7th, 8th columns of data frame (county -> phone1).

行列选择 loc

a.) Selecting rows by label/index
b.) Selecting rows with a boolean / conditional lookup
The loc indexer is used with the same syntax as iloc: data.loc[<row selection>, <column selection>]

# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email'
data.loc[['Andrade', 'Veness'], 'city':'email']
# Select same rows, with just 'first_name', 'address' and 'city' columns
data.loc['Andrade':'Veness', ['first_name', 'address', 'city']]
 
# Change the index to be based on the 'id' column
data.set_index('id', inplace=True)
# select the row with 'id' = 487
data.loc[487]

替换值

# inplace=True 替换原始数据，否则返回替换后的副本（新版 pandas 中 inplace 只能用关键字参数传入）
df.replace(-2147483645.0, 0, inplace=True)

# 替换多个值 {原值:替换后的值,原值:替换后的值}
df.replace({-2147483645.0:0,-2147483648.0:0},inplace=True)

读写文件

csv 文件

# 文件没有标题，指定 df 标题
df = pd.read_csv(r'成都西菱凸轮轴升程表FT2.5L-EX-H闭到开.txt', dtype=float,sep='\t',header=None,names=['t1','t2'])
   
# 按照 float 类型读取；不指定 dtype 时由 pandas 自动推断各列类型
df_csv = pd.read_csv('csv_example',dtype=float,header=None)

# 要注意数据类型，有时不是数值类型，画图会有问题
print(s.dtypes)

# index 不写，否则读取的时候，会生成2份
df.to_csv('csv_example', index=False)
# 去掉标题 header
df.to_csv('refine.csv', index=False,header=False)

# 从字符串读取 csv 文件
bytes_data = shm.buf.tobytes()
s = str(bytes_data[:data_size], 'utf-8')
data = StringIO(s)
df = pd.read_csv(data)

写 csv 权限问题

# 先创建文件并打开
file = open(msg_json["Param"], "w")
df_all.to_csv(file, index=False)
file.close()

f = open(os.path.join(root, name))
datas = f.read()
data = StringIO(datas)
df = pd.read_csv(data,header=None)

from string

from io import StringIO

# 这种方式比较耗时，不如先分割成 list 再转换为 df 快
s = str(bytes_data, 'utf-8')
data = StringIO(s)
df = pd.read_csv(data)


# s 是 [xx,xx,xx] 列表
df = pd.read_csv(StringIO(s),delimiter=',', delim_whitespace=False, names=cols)


header_list = ["Name", "Dept", "Start Date"]
df = pd.read_csv("sample_file.csv", names=header_list)

df = pd.read_csv("sample_file.csv", header=None)
print(df)

ndarray 转 dataframe

my_array = np.array([[11,22,33],[44,55,66]])

df = pd.DataFrame(my_array, columns = ['Column_A','Column_B','Column_C'])

# 多个时，转成list 然后在一起转
xlist = x.tolist()
ylist = y.tolist()
datas = pd.DataFrame(list(zip(xlist, ylist)), columns=['x','y'], dtype=float)

series to dataframe

df = my_series.to_frame()
df = pd.DataFrame(my_series)

列操作

列数

df.shape[1]

添加列

df = pd.DataFrame(columns=['A', 'B'], data = [[1,2],[3,4]])
df['C'] = None

# 添加多列
# 尾部追加
pd.concat([df, pd.DataFrame(columns=['D','E'])])
# 排序并追加，可以保持原有列的数值
df.reindex(columns=['A','B','C','D','E'])

修改列名

# t1 改为 x
df = df.rename(columns={'t1': 'x'})

删除列

# inplace 是否替换输入的df
df.drop(col_names_list, axis=1, inplace=True)

获取某一列

# 类型是 series
col = df["YY"]
# 返回的是一个DataFrame
col = df[["YY"]]

过滤列

[] 是 boolean 操作符，保留 true 的数据

aqicsv[aqicsv["predictaqi_norm1"]>100]
aqicsv[(aqicsv["FID"]>37898) & (aqicsv["FID"]<38766) ]

列是否存在

if 'ma5' not in datas or 'ma60' not in datas:
    datas = calc_ma(datas, ["5", "10", "25", "43", "60"])

行操作

行数

df.shape[0]

行变列

df_row.T
# 副本
df_t_copy = df_.T.copy()

df_tr = df.transpose()

删除行

df = df.drop([1,2,3])

获取第一行索引

df_idx = df.loc[[0]].index.tolist()[0]
# 如果索引不是从 0 开始的，获取第一行
dd = df_calc.iloc[0:1]

遍历行

# 这个快些
counts = len(df)
for idx in range(0,counts):
    if df.at[idx,'code'] > x1 and sindex == 0:
        sindex = df.loc[[idx]].index.tolist()[0]
    elif df.at[idx,'code'] >= x2 and sindex != 0:
        eindex = df.loc[[idx]].index.tolist()[0]
        df = df.loc[sindex:eindex]
        df = df.reset_index(drop=True)
        return df
        
# 该方法较慢
for index, row in df.iterrows():
    print(index, row['ts_code'], row['trade_date'])
    df_row = df[index:index + 1]
    print(df_row)

获取一行

df.iloc[0]   #返回的是Series
df.iloc[[0]]  #如果在里面多加一个方括号，那么返回的是DataFrame

# 最后一行
dd = df_calc.iloc[-1:]

#iloc只能用数字索引，不能用索引名
data.iloc[-1]   #选取DataFrame最后一行，返回的是Series
data.iloc[-1:]   #选取DataFrame最后一行，返回的是DataFrame

获取连续多行

df.loc[1:4]

选取特定行的数据

# 按索引标签选取（loc做法）
df_name.loc[["Ivysaur","VenusaurMega Venusaur","Charizard","Squirtle"]]
# 按索引位置选取（iloc做法）
df_name.iloc[[1,3,6,9]]

添加一行数据

# angle is a list
df_result.loc[0] = angle

new_row = {'name':'Geo', 'physics':87, 'chemistry':92, 'algebra':97}
df_marks = df_marks.append(new_row, ignore_index=True)

# 添加多行数据到 dataframe
df = pd.DataFrame([[1, 2], [3, 4]], columns = ["a", "b"])
print(df)
OUTPUT
   a  b
0  1  2
1  3  4

删除行

df = df.drop([1,2,3])

list,dict to dataframe

labels = ['date','open', 'high', 'low', 'close', 'volume', 'code']
# angle is a list
datas = pd.DataFrame([angle], columns=labels, dtype=float)
datas.set_index('date', inplace=True)

# 多个 list 合并为 dataframe
labels = ['角度', '数值']
datas = pd.DataFrame(list(zip(numbers[1], numbers[6])), columns=labels)

# data_arr is list of dict [{},{}...]
df = pd.DataFrame(data_arr)
  
# initialize list of lists  
data = [['Geeks', 10], ['for', 15], ['geeks', 20]]  
  
# Create the pandas DataFrame  
df = pd.DataFrame(data, columns = ['Name', 'Age'])  
  
# print dataframe.  
print(df )

dataframe to list of dict

codes = df_codes.to_dict('records')

判断空

if df.empty:
    return None

nan None Null

from numpy import NaN
NaN是numpy\pandas下的，不是Python原生的，Not a Number的简称。
数据类型是float

None是一个python特殊的数据类型。
None不同于空列表和空字符串，是一种单独的格式。

print(type(None))
NoneType


if per_data.loc[0, 'volume_ratio'] is None:
    per_data.loc[0, 'volume_ratio'] = NaN

isnull()
notnull()
dropna(): 过滤丢失数据
fillna(): 填充丢失数据

过滤nan

# 按行过滤
df = df.dropna(axis=0)
# 按列过滤
df = df.dropna(axis=1)

# 按行过滤, 所有值都是 na 的行
df = df.dropna(axis=0,how='all')

填充 NaN

# 使用填充最后一个数值的方式进行列对齐
df_all = df_all.fillna(method='pad')

重建索引

drop=True 就不会多出来一列了

datas = datas.reset_index(drop=True)

获取索引

sindex = df.loc[[idx]].index.tolist()[0]

判断是否相等

# 给出多个值
print((chk_height == y_standard))

# 给出一个值 True or False
print((chk_height==y_standard).all())

生成测试数据

# start, stop, num=生成点个数
x_standard_new = np.linspace(0, 360, 720)

获取重复值

# 重复项索引
df_dup1 = df['value'].duplicated()
df_dup = df_value[df_value['value'].duplicated()]

去除重复项

DataFrame.drop_duplicates(subset=None, keep='first', inplace=False)

df_nodup = df.drop_duplicates(subset='data',keep='first',inplace=False)

去掉连续重复项

cols = ['code', 'data']
de_dup = it[cols].loc[(it[cols].shift() != it[cols]).any(axis=1)]

修改数据精度

df = df.round(2)

删除指定列中含有指定字符的行

# 删除某列包含特殊字符的行
datas = datas[~datas['name'].str.contains('ST')]
codes = codes[~codes['ts_code'].str.startswith('688')]

合并

append 会创建新的对象，所以效率会差些
通常使用 dataframe list 保存数据，然后使用 concat() 函数一次性合并

https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html#concatenating-using-append

# 这个是追加行合并
df_all = pd.concat(df_list, ignore_index=True)

# 合并列
df_all = pd.concat(share_df_list,axis=1)

排序

df.sort_values('trade_date', ascending=True, inplace=True)
results = t_codes.find().sort('trade_date', pymongo.DESCENDING).limit(1)

inplace 表示是否修改当前的数据，false的话，当前数据不变，返回修改后的

打印属性设置

pd.set_option('display.width', 5000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.float_format = '{:,.2f}'.format

with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', 500,
                        'display.width', 5000):
    print(df)

pd.options.display.float_format = '{:.6f}'.format
pd.set_option('display.precision', 6)

数据平移 df.shift

# 整个表上下移动（相当于在表第一行插入一空白行，但是最后一行由于没有 index ，就消失了）
print(df.shift(1)) # 下移 1 行
print('\n')
print(df.shift(-2)) # 上移 2 行
print('\n')

# 左右平移
print(df.shift(1,axis=1)) # 右移 1 列，数据格式不兼容则显示 NaN
print('\n')
print(df.shift(-2,axis=1)) # 左移 2 列，数据格式不兼容则显示 NaN
print('\n')

计算

# https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# 优先使用 loc 
df.loc[:, 'y'] = df.loc[:, 'y'] / 100


df = pd.DataFrame({'C1': [1, 1, 1],
                   'C2': [1, 1, 1],
                   'C3': [1, 1, 1]})
df
#    C1  C2  C3
# 0   1   1   1
# 1   1   1   1
# 2   1   1   1

df + 1
#    C1  C2  C3
# 0   2   2   2
# 1   2   2   2
# 2   2   2   2

df['C1'] = df['C1'] + np.array([1, 2, 3])
df
#    C1  C2  C3
# 0   2   1   1
# 1   3   1   1
# 2   4   1   1

df.iloc[2, 2] += 5
df
#    C1  C2  C3
# 0   2   1   1
# 1   3   1   1
# 2   4   1   6

df[['C1', 'C2']] -= 5
df
#    C1  C2  C3
# 0  -3  -4   1
# 1  -2  -4   1
# 2  -1  -4   6

格式化输出

格式化输出
print(tabulate(df.head(5), headers='keys', tablefmt='psql'))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
#     pd.set_option('display.height', 1000)
pd.options.display.float_format = '{:,.2f}'.format

创建空的dataframe

# 创建一个空的 DataFrame
df_empty = pd.DataFrame(columns=['A', 'B', 'C', 'D'])

清空 dataframe

df_tip = df_tip.iloc[0:0]

获取指定行列数据

DataFrame.at
Access a single value for a row/column label pair.
print(data.at[0, 'turnover_rate'])
print(data.at[1, 'turnover_rate'])
## 指定行
data.iloc[0]      #取data的第一行（旧版的 irow 已被移除）
data.iloc[:, 0]   #取data的第一列（旧版的 icol 已被移除）

# 第一行 t1 数据
df.iloc[[0]]['t1'].to_list()[0]
# 最后一行 t1 数据
df.iloc[-1:]['t1'].to_list()[0]

行，列统计和

# 计算第5列的累加和
column_sum = lt100.iloc[:, 5].sum()

返回行数，列数

# 返回列数
df.shape[1]
# 返回行数
len(df)

最大值，最小值，均值

# 列的最大，最小值
print(datas.loc[:, "角度1"].max())
print(datas.loc[:, "角度1"].min())

# .ix 已在 pandas 1.0 中移除；idxmax()/idxmin() 返回的是索引标签，所以用 loc
highestLine = klines.loc[klines['close'].idxmax()]
lowestLine = klines.loc[klines['close'].idxmin()]

# Average for each column: returns a Series
df.mean(axis=0)
# Average for each row:
df.mean(axis=1)

# 获取最大值的一行
maxIdx = df_fourier['y'].idxmax()
maxRow = df_fourier.loc[[maxIdx]]  # idxmax() 返回的是索引标签，应使用 loc 而不是 iloc

差值 diff

# 计算 column 每 5 个数值的差
df_mean = df_row.diff(axis=1,periods=5)

rolling

# 一个数据一个数据的移动
df = pd.DataFrame({'B': [0, 1, 2, 3, 4,5,6]})
print(df.rolling(2).sum())

# 返回多个聚合结果，如sum()、mean()
df2.rolling(window=2, min_periods=1)["amount"].agg([np.sum, np.mean])
      sum   mean
0   12000.0 12000.0
1   30000.0 15000.0
2   18000.0 18000.0
3   12000.0 12000.0
4   21000.0 10500.0
5   25000.0 12500.0
6   34000.0 17000.0

# rolling by row data
df_mean = df_row.rolling(window=5, axis=1).mean()

切片

# 获取 0,1行，所有列
df1.loc[[0, 1], :]

df[0:]	#第0行及之后的行，相当于df的全部数据，注意冒号是必须的
df[:2]	#第2行之前的数据（不含第2行）
df[0:1]	#第0行
df[1:3] #第1行到第2行（不含第3行）
df[-1:] #最后一行
df[-3:-1] #倒数第3行到倒数第2行（不含倒数第1行）


# df.loc[index, column_name],选取指定行和列的数据
df.loc[0,'name'] # 'Snow'
df.loc[0:2, ['name','age']] 		 #选取第0行到第2行，name列和age列的数据, 注意这里的行选取是包含下标的。
df.loc[[2,3],['name','age']] 		 #选取指定的第2行和第3行，name和age列的数据
df.loc[df['gender']=='M','name'] 	 #选取gender列是M，name列的数据
df.loc[df['gender']=='M',['name','age']] #选取gender列是M，name和age列的数据

df.iloc[0,0]		#第0行第0列的数据，'Snow'
df.iloc[1,2]		#第1行第2列的数据，32
df.iloc[[1,3],0:2]	#第1行和第3行，从第0列到第2列（不包含第2列）的数据
df.iloc[1:3,[1,2]]	#第1行到第3行（不包含第3行），第1列和第2列的数据