按照以下方式导入pandas库:

import pandas as pd

Import

CSV

df = pd.read_csv("your_file.csv")
df.to_csv("your_file.csv")

Excel

df = pd.read_excel("your_file.xlsx", 'sheet1')
df.to_excel('your_file.xlsx', sheet_name='sheet2')

ODPS

from odps import ODPS
from odps.df import DataFrame

odps = ODPS( 'your-access-id', 'your-access-key', 'your-project', endpoint='your-endpoint')
df = DataFrame(odps.get_table('your-table', project="your-project")).to_pandas()

Preview

df.head(5)
df.tail(10)
df.columns # 列名
df.dtypes # 类型
df.describe() # 每列描述统计
df.cov() # 协方差
df.corr() # 相关系数

Manipulate

filter

df[df.SepalLength>=5] 
df[df.Species.isin(["setosa", "versicolor"])] # 根据一个list的取值范围过滤
df[(df.PetalLength>=1) & (df.SepalWidth<3)] # 根据多个条件过滤

rename

df2 = df.rename(columns={'old_columnname': 'new_columnname'})
df.rename(columns={'old_columnname': 'new_columnname'}, inplace=True)

slice

df[0:3] # 选取行

select

df.flag # 选择一列
df["flag"] # 选择一列
df[["flag", "link"]] # 选择多列

mutate

df.normal_cnt = df.total_cnt - df.abnormal_cnt

group by

import pandas as pd
import numpy as np

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})
                   
grouped = df.groupby(["A", "B"])

# 选取某一个分组
grouped.get_group(('foo','one'))

# 分组迭代
for name, group in grouped:
    print name
    print group

# 分组聚合
grouped.aggregate(np.sum) # 所有列使用一个聚合函数
grouped['C'].agg([np.sum, np.mean, np.std]) # 选取一个列使用多个聚合函数
grouped.agg([np.sum, np.mean, np.std]) # 所有列使用多个聚合函数
grouped.agg({'C' : np.sum,
             'D' : lambda x: np.std(x, ddof=1)}) # 每个列定制自己的聚合函数

distinct

df['column1'].unique()

union

pd.concat([df1, df2])

join

pd.merge(df1, df2, on='column1', how='inner') # change how to ['left', 'right', 'outer']

pivot table

pd.pivot_table(df, values='column1', index=['column2', 'column3'], columns=['column4']) # 默认采用sum聚合
pd.pivot_table(df, values='column1', index=['column2', 'column3'], columns=['column4'], aggfunc=len) # 采用aggfunc自定义聚合函数  

na

df1.dropna() # 删除含na的行和列
df2.fillna(value=5) # 填充缺失值为5
mean = df2['column1'].mean()
df2['column1'].fillna(mean) # 用均值填充

apply function

by element

# map: iterate over each element of a series
df['column1'].map(lambda x: 10+x) # add 10 to each element of column1
df['column2'].map(lambda x: "AV"+x) # concatenae "AV" at the beginning of each element of column2

# applymap: iterate over each element of a data frame
df.applymap(lambda x: x+2)

by axis

df[['column1', 'column2']].apply(sum) # return the sum of all values of column1 and column2

资源