- , , . ~ 40 , ~ 1,8 , ~ 20 /. 10K, ~ 40MB . . , .
(, -ascii- UTF-8), . , , . , ~ 40 ~ 20 , . , , , .
Python, . , , ( ascii-):
import random
from os.path import getsize
# Print a random sample of lines from a file whose lines all have the same
# byte length, seeking straight to each chosen line instead of scanning.
file_name = 'file.csv'
selection_count = 10000
file_size = getsize(file_name)

# Binary mode: positions computed as index * line_size are byte offsets, and
# seeking a text-mode file to a computed (non-tell()) offset is undefined.
# Line terminators are therefore passed through to the output untranslated.
with open(file_name, 'rb') as file:
    # The byte length of the first line (terminator included) is taken as
    # the length of every line in the file.
    file.readline()
    line_size = file.tell()
    # An empty file yields a zero-length first line; avoid dividing by zero.
    line_count = file_size // line_size if line_size else 0
    # Clamp the sample size: random.sample raises ValueError when asked for
    # more items than the population holds.
    selection_indices = sorted(
        random.sample(range(line_count), min(selection_count, line_count))
    )
    # Index of the line the file position currently points at; line 0 was
    # consumed above while measuring the line size.
    next_index = 1
    for line_index in selection_indices:
        # Skip the seek when the wanted line directly follows the last read.
        if line_index != next_index:
            file.seek(line_index * line_size)
        # UTF-8 is a superset of ASCII, so plain ASCII files decode unchanged.
        line = file.readline().decode('utf-8')
        print('Line #{}: {}'.format(line_index, line), end='')
        next_index = line_index + 1
() , . . , , 40 . - , CSV unic ascii unicode, UTF-8, , . , , , :
import random
from os.path import getsize
# Print lines picked by jumping to offsets inside the file: after each jump
# the rest of the current line is skipped and the following line is printed.
file_name = 'file.csv'
selection_count = 10000
max_line_bytes = 40000

file_size = getsize(file_name)
# NOTE(review): make_offsets is defined outside this snippet; its results are
# used below as seek positions in the binary file -- confirm its contract.
selection_offsets = make_offsets(selection_count, file_size, max_line_bytes)

with open(file_name, 'rb') as source:
    for position in selection_offsets:
        source.seek(position)
        # Discard whatever remains of the line the position landed in.
        source.readline()
        raw_line = source.readline()
        print(raw_line.decode('utf-8'), end='')
- Python 3.