python对大文件的处理
多线程框架中通常采用 queue(队列)来实现线程间对共享资源的互斥访问。
在文件过大的情况下,如果都读入内存的话,占用内存就太多了。
这里手动实现了一个可供多线程共享的文件行读取器:各线程在加锁的前提下调用文件迭代器的 f.__next__()(等价于 next(f))逐行读取,而不必把整个文件读入内存。
# -*- coding:utf-8 -*-
import threading
class Geturl(object):
    """Thread-safe, line-at-a-time reader over a text file.

    The file is opened once and consumed lazily through its iterator, so
    the whole file never has to be held in memory. Every read happens
    under a re-entrant lock, so several threads can share one instance.

    End of input is signalled by *returning* the ``StopIteration`` class
    itself (it is not raised); callers compare the result against it.
    """

    def __init__(self, open_file):
        """Open *open_file* for reading and create the locks.

        Args:
            open_file: Path of the text file to read line by line.

        Raises:
            OSError: If the file cannot be opened.
        """
        self.open_file = open_file
        # Counter attribute; nothing in this class updates it.
        self.num = 0
        self.__mutex = threading.RLock()
        self.f = open(self.open_file, 'r')
        # Not used inside this class; presumably taken by external code
        # sharing this instance -- TODO confirm against callers.
        self.kafka_mutex = threading.RLock()

    def _line(self):
        """Return the next line, or the ``StopIteration`` class at EOF.

        The file is closed the first time the end is reached. Later calls
        keep returning the sentinel instead of failing on the closed file.
        """
        # ``with`` guarantees the lock is released even if the read raises
        # (e.g. a decode error); otherwise other threads would block forever.
        with self.__mutex:
            if self.f.closed:
                # Another caller already hit EOF and closed the file.
                return StopIteration
            try:
                return next(self.f)
            except StopIteration:
                self.f.close()
                return StopIteration

    def get_line(self):
        """Public accessor: return the next line or the EOF sentinel."""
        return self._line()
def _deal(deal_file):
    """Drain *deal_file* by calling ``get_line()`` until it is exhausted.

    The loop ends, after printing a completion message, when ``get_line()``
    returns the ``StopIteration`` class or raises ``ValueError``. Any other
    exception is printed and the loop moves on to the next call.

    Args:
        deal_file: Object exposing a ``get_line()`` method.
    """
    while True:
        try:
            item = deal_file.get_line()
            # Per-item processing work goes here.
            exhausted = bool(item == StopIteration)
        except ValueError:
            # Treated the same as reaching the end-of-input sentinel.
            exhausted = True
        except Exception as exc:
            print("error:", exc)
            continue
        if exhausted:
            print("all task has done!")
            break
if __name__ == "__main__":
    # Script entry point: read the file named 'Bigfile' line by line
    # through a shared Geturl reader and hand it to the worker loop.
    filename = "Bigfile"
    geturl = Geturl(open_file=filename)
    _deal(deal_file=geturl)