# -*- coding:Utf-8 -*-
#
# LineCount003.py
#
# Source :
#
# http://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python
#
"""Benchmark of several ways to count the lines of a file.

Each ``*count`` function takes a file name and returns an integer line
count. Two families coexist and can disagree on a file whose last line
has no trailing newline:

* ``mapcount``, ``simplecount`` and ``opcount`` count *lines read*, so an
  unterminated last line is counted;
* ``bufcount``, ``rawgencount``, ``rawcount``, ``rawincount`` and
  ``CountLines`` count *newline characters*, so it is not.

Run as a script, the module times every function on ``BIG_FILE.csv`` and
prints the average duration and the line count reported by each.
"""
from __future__ import with_statement

import time
import mmap
import random
from collections import defaultdict
from itertools import (takewhile, repeat)

# Chunk size (1 MiB) used by every buffered reader below.
_BUF_SIZE = 1024 * 1024


def mapcount(filename):
    """Count lines by memory-mapping the file and calling ``readline``.

    The file is mapped read-only, so no write permission is needed.
    An empty file returns 0; it is never handed to ``mmap``, which
    refuses to map a zero-length file.
    """
    with open(filename, "rb") as f:
        # Seek to the end (whence=2) to learn the size without importing os.
        if f.seek(0, 2) == 0:
            return 0
        buf = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            lines = 0
            readline = buf.readline  # loop optimization
            while readline():
                lines += 1
            return lines
        finally:
            buf.close()


def simplecount(filename):
    """Count lines by iterating over the file in text mode."""
    lines = 0
    with open(filename) as f:
        for _line in f:
            lines += 1
    return lines


def bufcount(filename):
    """Count newline characters, reading the file in text-mode chunks."""
    with open(filename) as f:
        lines = 0
        read_f = f.read  # loop optimization
        buf = read_f(_BUF_SIZE)
        while buf:
            lines += buf.count('\n')
            buf = read_f(_BUF_SIZE)
        return lines


def opcount(fname):
    """Count lines with ``enumerate`` over the file in text mode.

    Returns 0 for an empty file.
    """
    # Start at -1 so that a loop that never runs (empty file) yields 0
    # instead of leaving ``i`` unbound.
    i = -1
    with open(fname) as f:
        for i, _line in enumerate(f):
            pass
    return i + 1


def _make_gen(reader):
    """Yield successive chunks from ``reader`` until it returns nothing.

    ``reader`` is called with the chunk size as its only argument.
    """
    b = reader(_BUF_SIZE)
    while b:
        yield b
        b = reader(_BUF_SIZE)


def rawgencount(filename):
    """Count newline bytes using unbuffered reads fed through a generator."""
    with open(filename, 'rb') as f:
        # The generator is fully consumed by sum() before the file closes.
        f_gen = _make_gen(f.raw.read)
        return sum(buf.count(b'\n') for buf in f_gen)


def rawcount(filename):
    """Count newline bytes using unbuffered reads in an explicit loop."""
    with open(filename, 'rb') as f:
        lines = 0
        read_f = f.raw.read  # loop optimization
        buf = read_f(_BUF_SIZE)
        # Empty-file detection borrowed from CountLines.
        if not buf:
            return 0
        while buf:
            lines += buf.count(b'\n')
            buf = read_f(_BUF_SIZE)
        return lines


def rawincount(filename):
    """Count newline bytes using unbuffered reads driven by itertools."""
    with open(filename, 'rb') as f:
        # repeat(None) is an endless ticker; takewhile stops at the first
        # empty chunk, i.e. at end of file.
        bufgen = takewhile(
            lambda x: x,
            (f.raw.read(_BUF_SIZE) for _ in repeat(None)),
        )
        return sum(buf.count(b'\n') for buf in bufgen)


# Similar to rawcount above, but reading in text mode.
def CountLines(filename):
    """Count newline characters, reading the file in text-mode chunks."""
    with open(filename) as f:
        lines = 0
        read_f = f.read  # loop optimization
        buf = read_f(_BUF_SIZE)
        # Empty file
        if not buf:
            return 0
        while buf:
            lines += buf.count('\n')
            buf = read_f(_BUF_SIZE)
        return lines


# Functions compared by the benchmark, in the order they are run.
BENCHMARKED = (
    mapcount,
    simplecount,
    bufcount,
    opcount,
    rawgencount,
    rawcount,
    rawincount,
    CountLines,
)


def run_benchmark(filename="BIG_FILE.csv", repeats=9):
    """Time every function of ``BENCHMARKED`` on ``filename``.

    Each function is called ``repeats`` times. Returns a pair
    ``(timings, line_counts)``: ``timings`` maps each function to the list
    of its durations in seconds, ``line_counts`` maps each function to the
    line count it returned on its last run. Durations and line counts are
    kept apart so that averaging the durations is meaningful.
    """
    timings = defaultdict(list)
    line_counts = {}
    for _ in range(repeats):
        for func in BENCHMARKED:
            start_time = time.time()
            nb_lines = func(filename)
            timings[func].append(time.time() - start_time)
            line_counts[func] = nb_lines
    return timings, line_counts


def main():
    """Run the benchmark on ``BIG_FILE.csv`` and print one row per function."""
    timings, line_counts = run_benchmark()
    for func, vals in timings.items():
        print(func.__name__, "\t : ", sum(vals) / float(len(vals)),
              "\t lines : ", line_counts[func])


if __name__ == "__main__":
    main()