02.python面试之文件操作

发表于 2021-11-10 更新于 2023-02-21 分类于 rd ， python Changyan：本文字数： 2.3k 阅读时长 ≈ 8 分钟

Python中有几个内置模块和方法来处理文件。这些方法被分割到例如os, os.path , shutil 和 pathlib 等等几个模块中。文章将列举Python中对文件最常用的操作和方法相关的面试问题。

文件操作

有一个jsonline格式的文件file.txt大小约为10K

def get_lines():
    """Return every line of ``file.txt`` as a list of ``bytes`` objects.

    The file is opened in binary mode and read in a single call, so the
    whole content is held in memory at once.
    """
    with open('file.txt', 'rb') as source:
        all_lines = source.readlines()
    return all_lines
if __name__ == '__main__':
    for e in get_lines():
        process(e)  # handle one line of data

现在要处理一个大小为10G的文件，但是内存只有4G，如果在只修改get_lines 函数而其他代码保持不变的情况下，应该如何实现？需要考虑的问题都有哪些？

内存不足以一次性读取整个文件，因此可以把 get_lines 改写为生成器，分片（分批）读取并逐批返回数据；需要考虑的问题包括：每次读取的分片大小、单行内容过长时的处理，以及分片过小带来的频繁 I/O 开销。

# Generator-based approach
def get_content():
    """Yield the content of ``statics/lines.txt`` in small batches of lines.

    Each yielded item is the list returned by one ``readlines(4)`` call;
    iteration ends as soon as that call returns an empty list.
    """
    with open('statics/lines.txt', 'r', encoding='utf8') as f:
        # iter(callable, sentinel) keeps calling readlines(4) until it gives []
        yield from iter(lambda: f.readlines(4), [])
# Drive the generator: each item printed is the list returned by one readlines(4) call.
for i in get_content():
    print(i)

['11111111111111111111111111111\n']
['22222222222222222222222222222\n']
['333333333333333333333333333333\n']
['4444444444444444444444444\n']
['555555555555555555555555555\n']
['666666666666666666666666666666666\n']
['77777777777777777777']

# Expert answer: memory-map the file instead of reading it into memory
from mmap import ACCESS_READ, mmap
def get_content(file_path):
    """Yield the lines of ``file_path`` one at a time via a read-only mmap.

    The file is mapped rather than read, so only the pages currently being
    scanned need to be resident in memory.

    Args:
        file_path: Path of the file to read.

    Yields:
        Each line decoded with the default codec of ``bytes.decode()``
        (UTF-8), including its trailing newline when present. The final
        line is yielded even if the file does not end with a newline.
    """
    with open(file_path, "rb") as f:
        # mmap() raises ValueError for a zero-length file, so stop early.
        if not f.read(1):
            return
        # Read-only mapping: no write permission needed, and the context
        # manager releases the mapping when iteration finishes.
        with mmap(f.fileno(), 0, access=ACCESS_READ) as m:
            # mmap.readline() returns b"" once the end of the map is reached.
            for raw_line in iter(m.readline, b""):
                yield raw_line.decode()
# Print each decoded line that the generator produces for 'statics/lines.txt'.
for i in get_content('statics/lines.txt'):
    print(i)

11111111111111111111111111111

22222222222222222222222222222

333333333333333333333333333333

4444444444444444444444444

555555555555555555555555555

666666666666666666666666666666666

补充缺失的代码

def print_directory_contents(sPath):
    """
    Take the name of a directory as the input argument and
    return the paths of the files in that directory,
    as well as the paths of the files in the directories it contains.
    """

import os
# Module-level list that collects the file paths
lis = []
# Plain approach: recursion
def print_directory_contents(sPath):
    """Append the path of every file under ``sPath`` to the module-level ``lis``.

    Sub-directories are visited recursively. Results accumulate in ``lis``
    across calls; nothing is returned.

    Args:
        sPath: Directory to scan.

    Raises:
        OSError: If ``sPath`` itself cannot be listed.
    """
    # Walk the entries of this directory
    for path in os.listdir(sPath):
        full_path = os.path.join(sPath, path)
        if os.path.isfile(full_path):
            lis.append(full_path)
        elif os.path.isdir(full_path):
            # Recurse only into real directories: a dangling symlink or a
            # special file is neither, and os.listdir() on it would raise.
            print_directory_contents(full_path)
# Scan './aaa', then show the paths accumulated in the module-level list.
print_directory_contents('./aaa')
print(lis)

['./aaa\\bbb\\bbb.txt', './aaa\\bbb\\bbbb\\bbbb.txt', './aaa\\ccc.txt']

# Using os.walk()
def print_directory_contents(sPath):
    """Return the paths of all files under ``sPath``, sub-directories included.

    Args:
        sPath: Root directory to walk.

    Returns:
        A new list of file paths, each built by joining the directory that
        ``os.walk`` reports with the file name. The list is empty when
        ``os.walk`` yields nothing for ``sPath``.
    """
    found = []
    for dirpath, _dirnames, filenames in os.walk(sPath):
        found.extend(os.path.join(dirpath, filename) for filename in filenames)
    return found
# The result list is local to the function, so print the returned value.
print(print_directory_contents('./aaa'))

['./aaa\\bbb\\bbb.txt', './aaa\\bbb\\bbbb\\bbbb.txt', './aaa\\ccc.txt']

00011.r、r+、rb、rb+文件打开模式区别

r ：以只读方式打开文件，文件指针会放在文件的开头
r+ ：打开一个文件用于读写，文件指针将会放在文件的开头
rb ：以二进制格式打开一个文件用于只读，文件指针将会放在文件开头，一般用于非文本文件如图片等
rb+：以二进制格式打开一个文件用于读写，文件指针将会放在文件开头，一般用于非文本文件如图片等
w ：打开一个文件只用于写入，如果该文件存在则打开文件，并从头开始编辑，即原有内容会被删除，如果文件不存在则创建新文件用于写入
w+ ：打开一个文件用于读写，如果该文件存在则打开文件，并从头开始编辑，即原有内容会被删除，如果文件不存在则创建新文件用于写入
wb ：以二进制格式打开一个文件只用于写入，如果该文件存在则打开文件，并从头开始编辑，即原有内容会被删除，如果文件不存在则创建新文件用于写入，一般用于非文本文件如图片等
wb+：以二进制格式打开一个文件用于读写，如果该文件存在则打开文件，并从头开始编辑，即原有内容会被删除，如果文件不存在则创建新文件用于写入，一般用于非文本文件如图片等
a ：打开一个文件用于追加，如果该文件已存在，文件指针将会放在文件结尾，也就是说，新内容会被写入到已有内容之后，如果该文件不存在，则创建新文件进行写入
a+ ：打开一个文件用于读写，如果该文件已存在，文件指针将会放在文件结尾，文件打开时会是追加模式，如果该文件不存在，创建新文件用于读写
ab ：以二进制格式打开一个文件用于追加，如果文件已存在，文件指针将会放在文件结尾，也就是说新的内容会被写入到已有内容之后，如果该文件不存在，创建新文件进行写入
ab+：以二进制格式打开一个文件用于读写（追加），如果文件已存在，文件指针将会放在文件结尾，如果该文件不存在，则创建新文件用于读写
x ：写模式，新建一个文件，如果该文件已存在则会报错

00012.请写一个Python逻辑，计算一个文件中的大写字母数量

with open('statics/en.txt', 'r', encoding="utf8") as f:
    # Read the whole text, then tally the characters for which isupper() is true.
    content = f.read()
    upper_lens = sum(1 for letter in content if letter.isupper())
    print(upper_lens)

00013.如何用Python找出你目前在哪个目录？

import os
# os.getcwd() returns the current working directory of the process.
print(os.getcwd())

F:\projects\python_exam

00014.如何以相反顺序展示一个文件的内容？

with open("statics/en.txt", mode='r', encoding="utf8") as f:
    # Read everything, then emit the characters from last to first.
    content = f.read()
    print("".join(reversed(content)))

.tsissa eht rof ydaer si troppuS lacinhceT ™namuH lautcA ruo ,srebircsbus orP rof dnA .noitatnemucod wollof-ot-ysae htiw uoy tops ot ereht thgir er'ew ,tfil reivaeh a htiw sedargpu roF .detcepxe sa "krow tsuj" dluohs gnihtyreve ,sedargpu tsom htiW

.rehtegotla 5 noisrev gnidliub tneps ew emit fo tnuoma eht slavir ytiliba-edargpu s'6 noisrev otni tup ew troffe fo tnuoma eht ,tcaf nI .htooms repus eb dluohs sgniht ,morf gnimoc er'uoy noisrev hcihw rettam on oS .4v dna 5v morf gnidargpu ysae ni tliub ev'ew taht troper ot dalg er'ew ,6 emosewA tnoF fo esaeler yraurbeF eht ot resolc hcni ew sA

.bulC egakaerB oN eht denioj dna syaw ruo degnahc ev’ew neht ecniS .taht tuoba yrros ylerecnis er’eW !hoD .4 noisrev htiw ytilibitapmoc ekorb ew nehw ,5 noisrev htiw yaw drah eht nossel ruo denrael eW

.ssem laer a s’ti ,haeY ?deb eht oop-oop ,onnud I ,ho ,ti evah ot ylno erawtfos wen ot dedargpu reve uoy evaH

00015.python的read() 、readline()、readlines()、xreadlines()

read([size])：从文件读取指定数量的数据（文本模式下按字符计，二进制模式下按字节计），如果未给定或者为负则读取所有
readline([size])：读取整行，size表示读取该行的长度
readlines([sizeint])：读取所有行并返回列表，如果给定sizeint>0，返回总和大约为sizeint字节的行，实际读取值可能比sizeint较大，因为需要填充缓冲区。
xreadlines()：仅存在于Python 2，返回一个逐行读取的迭代器；Python 3中已移除，直接迭代文件对象（for line in f）即可。

import random
with open('statics/lines.txt', 'r', encoding="utf8") as f:
    # Each readline(n) call returns at most n characters of the current line;
    # if the line is not finished, the next call continues with the rest of it.
    # iter(callable, '') stops at the empty string readline returns at end of file.
    for line in iter(lambda: f.readline(random.randint(1, 100)), ''):
        print(line)

11111111111111111111111111111

2222222222222222222222
2222222

333333333333333333333333333333

444444444444444444444444
4

555555555555555
555555555555

6666666666666666666
66666666666666

7777777777777777777
7

readlines(sizeint) 示例：

with open('statics/lines.txt', 'r', encoding="utf8") as f:
    # 30 is a size hint: whole lines are collected until their combined length
    # passes the hint, so a line shorter than 30 pulls in the next line as well.
    print(f.readlines(30))

['11111111111111111111111111111\n', '22222222222222222222222222222\n']

00016.使用代码实现查看列举目录下的所有文件

# With no argument, list the files and folders in the current directory
os.listdir()
# List the files and folders under a relative path
os.listdir('./statics')
# List the files and folders under an absolute path
os.listdir('F:/projects/python_exam/aaa')

['bbb', 'ccc.txt']

00017.如何在Python中删除文件？

os.remove(path)：删除路径为path的文件，如果path是文件夹，则抛出OSError。
os.removedirs(path)：递归删除空目录（先删除最底层的目录，再依次删除变为空的上级目录）。
os.rmdir(path)：删除path指定的空目录，如果目录非空，则抛出OSError错误。

import os
# Delete a file; os.remove() returns None, which is what gets printed
print(os.remove('F:/projects/python_exam/aaa.txt'))

None

# Remove an empty directory
print(os.rmdir('F:/projects/python_exam/ccc'))

None

# Remove a directory recursively
print(os.removedirs('./bbb'))

None

00018.设计实现遍历目录与子目录，抓取.pyc文件

# os.walk yields each directory under the root together with the file names it holds.
for folder, _subfolders, names in os.walk(r'F:/projects/celery_test'):
    compiled = (name for name in names if name.endswith(".pyc"))
    for name in compiled:
        print(os.path.join(folder, name))

F:/projects/celery_test\__pycache__\celery_task.cpython-37.pyc

import os
# Approach two: recursion
def get_pyc(path):
    """Print the full path of every ``.pyc`` file under ``path``.

    Sub-directories are visited recursively.

    Args:
        path: Directory to scan.

    Raises:
        OSError: If ``path`` itself cannot be listed.
    """
    for filepath in os.listdir(path):
        file_full_path = os.path.join(path, filepath)
        if os.path.isfile(file_full_path):
            if filepath.endswith(".pyc"):
                print(file_full_path)
        elif os.path.isdir(file_full_path):
            # Recurse only into real directories: a dangling symlink or a
            # special file is neither, and os.listdir() on it would raise.
            get_pyc(file_full_path)
# Scan the project tree and print every .pyc path that get_pyc finds
get_pyc(r'F:/projects/celery_test')

F:/projects/celery_test\__pycache__\celery_task.cpython-37.pyc