Random sampling of lines from a file

I have a csv file that is ~ 40 GB and 1800000 lines.

I want to randomly select 10,000 lines and write them to a new file.

Right now, my approach is to use sed as:

(sed -n '$vars' < input.txt) > output.txt

Where `$vars` is a randomly generated list of sed print commands. (For example: 1p; 14p; 1700p; ...; 10203p)

While this works, it takes 5 minutes to complete. This is not a great time, but I was wondering if anyone has any ideas on how to do this faster?

+6
source share
6 answers

First, some back-of-the-envelope math: at ~40 GB for ~1.8 million lines, the average line is roughly 20 KB. Selecting 10K of those lines means touching only a tiny fraction of the file, so streaming the whole thing is wasteful — the key is to jump straight to the lines you want.

If every line has the same length in bytes (beware of non-ASCII UTF-8 data, where character count and byte count differ), you can compute the byte offset of any line directly and seek to it, reading ~20 KB per selected line instead of scanning the full ~40 GB.

In Python that looks like the following. It assumes all lines have the same byte length (which is safe for pure-ASCII data):

import random
from os.path import getsize


def sample_equal_length_lines(file_name, selection_count):
    """Randomly select `selection_count` lines from `file_name`, assuming
    every line in the file has exactly the same length in bytes.

    Returns a list of (line_index, line_text) pairs in increasing
    line-index order.

    Raises ValueError (from random.sample) if `selection_count` exceeds
    the number of lines in the file, which is the sane failure mode.
    """
    file_size = getsize(file_name)
    # Binary mode is required: seek() to an arbitrary computed offset is
    # only well-defined for binary files — in text mode, only opaque
    # cookies returned by tell() may be passed to seek().
    with open(file_name, 'rb') as fp:
        # Read the first line to measure the (uniform) line length.
        fp.readline()
        line_size = fp.tell()
        # Integer division may discard a final line that lacks a trailing
        # newline; use round(file_size / line_size) if that matters.
        line_count = file_size // line_size
        # Trivial way to pick distinct line numbers; substitute your own
        # selection method if you already have one.
        selection_indices = sorted(random.sample(range(line_count),
                                                 selection_count))

        selected = []
        # The file position currently sits at the start of line 1, so a
        # prev_index of 0 lets us skip the seek when the next selected
        # line is the one immediately following the last one read.
        prev_index = 0
        for line_index in selection_indices:
            if line_index != prev_index + 1:
                # seek() measures from the start of the file by default.
                fp.seek(line_index * line_size)
            selected.append((line_index, fp.readline().decode()))
            prev_index = line_index
    return selected


if __name__ == '__main__':
    for index, line in sample_equal_length_lines('file.csv', 10000):
        print('Line #{}: {}'.format(index, line), end='')

If the lines are not all the same length, you can still avoid reading all 40 GB: seek to a random byte offset, discard the partial line you land in, and read the next complete line. This also works when the CSV is not plain ASCII but a variable-width Unicode encoding such as UTF-8. The trade-offs are that longer lines are more likely to be selected, and you no longer know which line number you got:

import random
from os.path import getsize


def make_offsets(selection_count, file_size, max_line_bytes):
    """Return `selection_count` strictly increasing byte offsets, each at
    least `max_line_bytes` apart, all within [0, file_size - max_line_bytes).

    The spacing guarantee means no two offsets can land in the same line
    (given `max_line_bytes` really is an upper bound on line length), and
    the end margin guarantees a full line always follows each offset.

    Raises ValueError if the file is too small for that many samples.
    """
    gap = max_line_bytes
    # Shrink the sampling space so that after spreading the picks apart
    # by `gap` each, the largest offset still respects the end margin.
    span = file_size - gap - (selection_count - 1) * gap
    if span < selection_count:
        raise ValueError(
            'file too small to take {} samples'.format(selection_count))
    picks = sorted(random.sample(range(span), selection_count))
    # Adding i * gap restores the required minimum spacing while keeping
    # the offsets monotonically increasing and unique.
    return [p + i * gap for i, p in enumerate(picks)]


def sample_lines_by_offset(file_name, selection_count, max_line_bytes):
    """Select `selection_count` random complete lines from `file_name`
    without knowing the line count, by seeking to random byte offsets.

    `max_line_bytes` is an upper bound on the line size in bytes (not
    characters). Note: longer lines are proportionally more likely to be
    selected, and the selected line numbers are unknown.
    """
    file_size = getsize(file_name)
    offsets = make_offsets(selection_count, file_size, max_line_bytes)
    lines = []
    with open(file_name, 'rb') as fp:
        for offset in offsets:
            # Skip to the offset, then discard the (probably partial)
            # line it landed in...
            fp.seek(offset)
            fp.readline()
            # ...and take the next full line. In binary mode the bytes
            # must be decoded explicitly.
            lines.append(fp.readline().decode('utf-8'))
    return lines


if __name__ == '__main__':
    for line in sample_lines_by_offset('file.csv', 10000, 40000):
        print(line, end='')

Both examples assume Python 3.

+5

Depending on your constraints, a shell-only solution using dd can also work.

If the line count were unknown you could get it with wc -l, at the cost of one full pass over the file; wc itself is fast, but on 40 GB even one pass takes time. Here the count (1,800,000) is already known.

Assuming every line is exactly 20,000 bytes, the selection becomes:

#!/bin/bash
# Pull 10,000 random fixed-width (20,000-byte) records out of `file`
# by letting dd skip straight to each record's byte offset.

shuf -n 10000 -i 0-1799999 | sort -n | while read -r rec
do
    dd if=file of=output bs=20000 count=1 skip="$rec" \
        status=none oflag=append conv=notrunc
done

Launching 10K separate dd processes adds noticeable overhead, so if per-process cost matters, a single Python process using seek() (as @tripleee and @Mad suggested) will be faster:

#!/usr/bin/python3
"""Copy 10,000 random fixed-width (20,000-byte) records from `file`
into `output` by seeking directly to each record's byte offset."""
import random

record_numbers = sorted(random.sample(range(0, 1800000), 10000))

record_size = 20000

with open("file", "rb") as src, open('output', 'wb') as dst:
    for record in record_numbers:
        src.seek(record_size * record)
        dst.write(src.read(record_size))

To avoid many small writes, you can also buffer everything in memory and write it out once:

# Same selection, but stage all the records in memory and perform a
# single write at the end instead of one write per record.
with open("file", "rb") as infile, open('output', 'wb') as outfile:
    chunks = []
    for n in randoms:
        infile.seek(lsize * n)
        chunks.append(infile.read(lsize))
    outfile.write(b''.join(chunks))
+2

A Python script that uses seek() to jump directly to each chosen line will be dramatically faster, because it never has to read the data between the selected lines.

Unlike a seek-based Python script, the sed script must stream through the file: to execute 123p, sed first has to read the preceding 122 lines' worth of bytes, and the same applies to every selected line number.

Note that in Python 3 you should open the file in binary mode to seek by byte offsets reliably (text mode adds Unicode decoding, and arbitrary seek offsets are not supported there).

+1

First, generate a test file with 1,800,000 lines:

$ awk 'BEGIN {for (i=1; i<=1800000; i++) print "line " i}' >file
$ ls -l file
-rw-r--r--  1 dawg  wheel  22288896 Jan  1 09:41 file

Counting its lines with POSIX wc is quick:

$ time wc -l file
 1800000 file

real    0m0.018s
user    0m0.012s
sys 0m0.004s

So we can confirm the file has 1,800,000 lines.

With the line count known, a single awk pass can pick random lines:

#!/bin/bash
# Emit roughly 10,000 random lines from `file` in a single pass:
# each input line is independently kept with probability ~c/lc.

read -r line_count _ < <(wc -l file)

awk -v lc="$line_count" -v c=10000 '
BEGIN { srand() }
{ if (int(lc * rand()) <= c) { print; i++ } }
i >= c { exit }
' file >rand_lines

That takes about 200 ms on my iMac. Note that it prints roughly 10,000 lines, not exactly 10,000, because each line is selected independently at random.

If you need to be more confident of reaching 10,000 lines before the end of the file, inflate the acceptance factor slightly:

awk -v lc="$lc" -v c=10000 '
BEGIN{srand()}
# Inflate the per-line acceptance threshold by a small factor (1.01
# here; raise it if needed) so that c matches are almost certainly
# produced before EOF; the exit rule below caps output at exactly c.
int(lc*rand()) < c * 1.01 {print; i++}
i>=c{exit}
' file >rand_lines

And if you need exactly 10,000 distinct random lines in a single pass:

awk -v lc="$lc" -v c=10000 '
BEGIN{srand()
      # Pre-draw c distinct line numbers. NR is 1-based, but
      # int(lc*rand()) yields 0..lc-1, so the original version could
      # never select the last line and wasted draws on a nonexistent
      # line 0; the +1 maps each draw onto the valid range 1..lc.
      while (i < c) {
        x = int(lc * rand()) + 1
        if (x in rl) continue  # retry duplicates; slow if c is close to lc
        rl[x]
        i++
      }
     }
NR in rl' file >rand_lines
+1

If you can load the data into a database (e.g. sqlite or mysql), selection becomes a simple SQL query:

select * from your_table where id in (1, 14, 1700, ...)

, http://jan.kneschke.de/projects/mysql/order-by-rand/

Loading the file into a database costs one upfront import pass, but after that you can repeat arbitrary random selections cheaply, and the database handles the indexing for you.

Whether that initial import is worthwhile depends on how often you need to re-sample the 40 GB file.

0

Another simple approach: stream the file once and keep each line independently with probability equal to the desired fraction (here 10,000 out of 1,800,000). The count you end up with will be approximate, not exact.

If you overshoot 10k slightly, randomly drop the excess; if you undershoot, top up with a small second pass. Either way you avoid holding the whole file in memory.

0

All Articles