| 1 | #!/usr/bin/env python |
|---|
| 2 | # Kanwei Li, 2010 |
|---|
| 3 | # Selects N random lines from a file and outputs to another file |
|---|
| 4 | |
|---|
| 5 | import random, sys |
|---|
| 6 | |
|---|
def _reservoir_sample(lines, count):
    """Pick `count` lines uniformly at random from `lines` in a single pass.

    Returns a ``(kept, seen)`` tuple: `kept` holds the chosen lines with
    their trailing newline stripped, in the order they appeared in the
    input; `seen` is the total number of lines read.  When the input has
    fewer than `count` lines, `kept` simply holds all of them.
    """
    kept = []
    seen = 0
    for seen, line in enumerate(lines, 1):
        line = line.rstrip("\n")
        if seen <= count:
            # Fill the reservoir with the first `count` lines.
            kept.append(line)
        elif random.randint(1, seen) <= count:
            # Keep the new line with probability count/seen, evicting a
            # uniformly chosen resident.  pop + append (rather than an
            # in-place overwrite) keeps the reservoir in input order.
            kept.pop(random.randint(0, count - 1))
            kept.append(line)
    return kept, seen


def main():
    """Select N random lines from a file and write them to another file.

    Arguments are read from ``sys.argv``:
        1: path of the input file
        2: number of lines to select (an integer, at least 1)
        3: path of the output file

    The input is streamed once, so only the selected lines are held in
    memory.  The selected lines are written in input order, joined by
    newlines, with no trailing newline.

    Exits with status 1 and a message on stderr when fewer than one line
    is requested or when the input has fewer lines than requested; in
    both cases the output file is not written.
    """
    total_lines = int(sys.argv[2])

    if total_lines < 1:
        sys.stderr.write("Must select at least one line.\n")
        # Non-zero status so callers can detect the failure.
        sys.exit(1)

    # The context manager closes the input even if sampling raises.
    with open(sys.argv[1], 'r') as infile:
        kept, n = _reservoir_sample(infile, total_lines)

    if n < total_lines:
        sys.stderr.write("Error: asked to select more lines than there were in the file.\n")
        sys.exit(1)

    # Close explicitly so the data is flushed before main() returns.
    with open(sys.argv[3], 'w') as outfile:
        outfile.write("\n".join(kept))
|---|
| 31 | |
|---|
# Run the sampler only when this file is executed as a script, so that
# importing it has no side effects.
if __name__ == "__main__":
    main()
|---|