Skip to content

19 csv文件处理

CSV(Comma-Separated Values)是最常见的数据格式之一——Excel导出、数据库备份、日志导出都用它。Python的csv模块就是专门处理CSV的。

一、读取CSV

1.1 csv.reader()

python
import csv

# 读取CSV文件
with open("data.csv", "r", encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

每行返回一个字符串列表:

['name', 'age', 'city']
['大志', '28', '北京']
['小明', '25', '上海']

重要:打开文件时必须指定newline=''

python
import csv

with open("data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    for row in reader:
        print(row)

1.2 跳过表头

python
import csv

with open("data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)  # 跳过表头
    print(f"列名: {header}")
    
    for row in reader:
        print(row)

1.3 转换为列表

python
import csv

with open("data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    rows = list(reader)

print(rows)
# [['name', 'age', 'city'], ['大志', '28', '北京'], ['小明', '25', '上海']]

二、写入CSV

2.1 csv.writer()

python
import csv

with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "age", "city"])      # 写一行
    writer.writerow(["大志", 28, "北京"])
    writer.writerows([                               # 写多行
        ["小明", 25, "上海"],
        ["小红", 23, "广州"]
    ])

2.2 自定义分隔符

python
import csv

# 使用制表符分隔
with open("data.tsv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["name", "age", "city"])
    writer.writerow(["大志", 28, "北京"])

2.3 自定义引号

python
import csv

# 所有字段都加引号
with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL)
    writer.writerow(["name", "age", "city"])
    writer.writerow(["大志", 28, "北京"])

引号常量:

常量说明
QUOTE_ALL所有字段都加引号
QUOTE_MINIMAL只在需要时加引号(默认)
QUOTE_NONNUMERIC非数字字段加引号
QUOTE_NONE不加引号

三、字典模式

3.1 DictReader

每行返回一个字典,键是表头。

python
import csv

with open("data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)

输出:

{'name': '大志', 'age': '28', 'city': '北京'}
{'name': '小明', 'age': '25', 'city': '上海'}

访问更方便:

python
import csv

with open("data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(f"{row['name']}, {row['age']}岁, 在{row['city']}")

3.2 DictWriter

python
import csv

with open("output.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["name", "age", "city"]
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    
    writer.writeheader()  # 写表头
    writer.writerow({"name": "大志", "age": 28, "city": "北京"})
    writer.writerows([
        {"name": "小明", "age": 25, "city": "上海"},
        {"name": "小红", "age": 23, "city": "广州"}
    ])

四、Dialect方言

4.1 预定义方言

python
import csv

# excel方言(默认)
print(csv.list_dialects())  # ['excel', 'excel-tab', 'unix']

# 使用unix方言(用\n换行)
with open("data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, dialect="unix")
    writer.writerow(["name", "age"])
    writer.writerow(["大志", 28])

4.2 自定义方言

python
import csv

# 注册自定义方言
csv.register_dialect("custom", 
    delimiter="|",
    quotechar="'",
    quoting=csv.QUOTE_MINIMAL
)

with open("data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, dialect="custom")
    writer.writerow(["name", "age", "city"])
    writer.writerow(["大志", 28, "北京"])

五、Sniffer:自动检测格式

5.1 检测分隔符

python
import csv

# 自动检测CSV格式
with open("data.csv", "r", encoding="utf-8") as f:
    sample = f.read(1024)  # 读取样本
    
sniffer = csv.Sniffer()
dialect = sniffer.sniff(sample)
print(f"分隔符: {dialect.delimiter}")
print(f"引号符: {dialect.quotechar}")

5.2 检测是否有表头

python
import csv

with open("data.csv", "r", encoding="utf-8") as f:
    sample = f.read(1024)

sniffer = csv.Sniffer()
has_header = sniffer.has_header(sample)
print(f"有表头: {has_header}")

六、处理特殊字符

6.1 包含逗号的字段

python
import csv
from io import StringIO

# 字段包含逗号时,自动加引号
output = StringIO()
writer = csv.writer(output)
writer.writerow(["name", "description"])
writer.writerow(["大志", "喜欢Python, LangChain"])

print(output.getvalue())
# name,description
# 大志,"喜欢Python, LangChain"

6.2 包含换行的字段

python
import csv
from io import StringIO

output = StringIO()
writer = csv.writer(output)
writer.writerow(["name", "bio"])
writer.writerow(["大志", "第一行\n第二行"])

print(output.getvalue())
# name,bio
# 大志,"第一行
# 第二行"

6.3 包含引号的字段

python
import csv
from io import StringIO

output = StringIO()
writer = csv.writer(output)
writer.writerow(["name", "quote"])
writer.writerow(["大志", '他说"Hello"'])

print(output.getvalue())
# name,quote
# 大志,"他说""Hello"""

七、实战场景

7.1 数据转换

python
import csv

# CSV转JSON
import json

def csv_to_json(csv_file, json_file):
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        data = list(reader)
    
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

csv_to_json("data.csv", "data.json")

7.2 数据过滤

python
import csv

# 过滤CSV数据
def filter_csv(input_file, output_file, condition):
    with open(input_file, "r", newline="", encoding="utf-8") as fin, \
         open(output_file, "w", newline="", encoding="utf-8") as fout:
        
        reader = csv.DictReader(fin)
        writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)
        writer.writeheader()
        
        for row in reader:
            if condition(row):
                writer.writerow(row)

# 过滤年龄大于25的记录
filter_csv("data.csv", "filtered.csv", lambda r: int(r["age"]) > 25)

7.3 合并CSV文件

python
import csv
from pathlib import Path

def merge_csv_files(input_dir, output_file):
    files = list(Path(input_dir).glob("*.csv"))
    
    with open(output_file, "w", newline="", encoding="utf-8") as fout:
        writer = None
        
        for filepath in files:
            with open(filepath, "r", newline="", encoding="utf-8") as fin:
                reader = csv.DictReader(fin)
                
                if writer is None:
                    writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)
                    writer.writeheader()
                
                for row in reader:
                    writer.writerow(row)

merge_csv_files("./data", "merged.csv")

八、总结

csv模块的核心:

类/函数用途
csv.reader()读取CSV,返回列表
csv.writer()写入CSV
csv.DictReader()读取CSV,返回字典
csv.DictWriter()写入CSV,接收字典
csv.Sniffer()自动检测CSV格式

使用要点:

  • 打开文件时加newline=''
  • 指定encoding='utf-8'
  • 包含逗号/换行的字段会自动加引号
  • DictReader/DictWriter更方便访问字段

CSV虽然简单,但用得非常多。掌握csv模块,数据导入导出就轻松了。