[3] | 1 | #!/usr/bin/perl |
---|
| 2 | use strict; |
---|
| 3 | use warnings; |
---|
| 4 | use Getopt::Std; |
---|
| 5 | |
---|
| 6 | sub parse_command_line(); |
---|
| 7 | sub load_word_list(); |
---|
| 8 | sub compile_regex(@); |
---|
| 9 | sub usage(); |
---|
| 10 | |
---|
| 11 | my $word_list_file; |
---|
| 12 | my $input_file ; |
---|
| 13 | my $output_file; |
---|
| 14 | my $find_complete_words ; |
---|
| 15 | my $find_inverse; |
---|
| 16 | my $find_in_specific_column ; |
---|
| 17 | my $find_case_insensitive ; |
---|
| 18 | my $skip_first_line ; |
---|
| 19 | |
---|
| 20 | |
---|
| 21 | ## |
---|
| 22 | ## Program Start |
---|
| 23 | ## |
---|
| 24 | usage() if @ARGV==0; |
---|
| 25 | parse_command_line(); |
---|
| 26 | |
---|
| 27 | my @words = load_word_list(); |
---|
| 28 | |
---|
| 29 | my $regex = compile_regex(@words); |
---|
| 30 | |
---|
| 31 | # Allow first line to pass without filtering? |
---|
| 32 | if ( $skip_first_line ) { |
---|
| 33 | my $line = <$input_file>; |
---|
| 34 | print $output_file $line ; |
---|
| 35 | } |
---|
| 36 | |
---|
| 37 | |
---|
| 38 | ## |
---|
| 39 | ## Main loop |
---|
| 40 | ## |
---|
| 41 | while ( <$input_file> ) { |
---|
| 42 | my $target = $_; |
---|
| 43 | |
---|
| 44 | |
---|
| 45 | # If searching in a specific column (and not in the entire line) |
---|
| 46 | # extract the content of that one column |
---|
| 47 | if ( $find_in_specific_column ) { |
---|
| 48 | my @columns = split ; |
---|
| 49 | |
---|
| 50 | #not enough columns in this line - skip it |
---|
| 51 | next if ( @columns < $find_in_specific_column ) ; |
---|
| 52 | |
---|
| 53 | $target = $columns [ $find_in_specific_column - 1 ] ; |
---|
| 54 | } |
---|
| 55 | |
---|
| 56 | # Match ? |
---|
| 57 | if ( ($target =~ $regex) ^ ($find_inverse) ) { |
---|
| 58 | print $output_file $_ ; |
---|
| 59 | } |
---|
| 60 | } |
---|
| 61 | |
---|
| 62 | close $input_file; |
---|
| 63 | close $output_file; |
---|
| 64 | |
---|
| 65 | ## |
---|
| 66 | ## Program end |
---|
| 67 | ## |
---|
| 68 | |
---|
| 69 | |
---|
| 70 | sub parse_command_line() |
---|
| 71 | { |
---|
| 72 | my %opts ; |
---|
| 73 | getopts('siwvc:o:', \%opts) or die "$0: Invalid option specified\n"; |
---|
| 74 | |
---|
| 75 | die "$0: missing word-list file name\n" if (@ARGV==0); |
---|
| 76 | |
---|
| 77 | $word_list_file = $ARGV[0]; |
---|
| 78 | die "$0: Word-list file '$word_list_file' not found\n" unless -e $word_list_file ; |
---|
| 79 | |
---|
| 80 | $find_complete_words = ( exists $opts{w} ) ; |
---|
| 81 | $find_inverse = ( exists $opts{v} ) ; |
---|
| 82 | $find_case_insensitive = ( exists $opts{i} ) ; |
---|
| 83 | $skip_first_line = ( exists $opts{s} ) ; |
---|
| 84 | |
---|
| 85 | |
---|
| 86 | # Search in specific column ? |
---|
| 87 | if ( defined $opts{c} ) { |
---|
| 88 | $find_in_specific_column = $opts{c}; |
---|
| 89 | |
---|
| 90 | die "$0: invalid column number ($find_in_specific_column).\n" |
---|
| 91 | unless $find_in_specific_column =~ /^\d+$/ ; |
---|
| 92 | |
---|
| 93 | die "$0: invalid column number ($find_in_specific_column).\n" |
---|
| 94 | if $find_in_specific_column <= 0; |
---|
| 95 | } |
---|
| 96 | else { |
---|
| 97 | $find_in_specific_column = 0 ; |
---|
| 98 | } |
---|
| 99 | |
---|
| 100 | |
---|
| 101 | # Output File specified (instead of STDOUT) ? |
---|
| 102 | if ( defined $opts{o} ) { |
---|
| 103 | my $filename = $opts{o}; |
---|
| 104 | open $output_file, ">$filename" or die "$0: Failed to create output file '$filename': $!\n" ; |
---|
| 105 | } else { |
---|
| 106 | $output_file = *STDOUT ; |
---|
| 107 | } |
---|
| 108 | |
---|
| 109 | |
---|
| 110 | |
---|
| 111 | # Input file Specified (instead of STDIN) ? |
---|
| 112 | if ( @ARGV>1 ) { |
---|
| 113 | my $filename = $ARGV[1]; |
---|
| 114 | open $input_file, "<$filename" or die "$0: Failed to open input file '$filename': $!\n" ; |
---|
| 115 | } else { |
---|
| 116 | $input_file = *STDIN; |
---|
| 117 | } |
---|
| 118 | } |
---|
| 119 | |
---|
| 120 | sub load_word_list() |
---|
| 121 | { |
---|
| 122 | open WORDLIST, "<$word_list_file" or die "$0: Failed to open word-list file '$word_list_file'\n" ; |
---|
| 123 | my @words ; |
---|
| 124 | while ( <WORDLIST> ) { |
---|
| 125 | chomp ; |
---|
| 126 | s/^\s+//; |
---|
| 127 | s/\s+$//; |
---|
| 128 | next if length==0; |
---|
| 129 | push @words,quotemeta $_; |
---|
| 130 | } |
---|
| 131 | close WORDLIST; |
---|
| 132 | |
---|
| 133 | die "$0: Error: word-list file '$word_list_file' is empty!\n" |
---|
| 134 | unless @words; |
---|
| 135 | |
---|
| 136 | return @words; |
---|
| 137 | } |
---|
| 138 | |
---|
| 139 | sub compile_regex(@) |
---|
| 140 | { |
---|
| 141 | my @words = @_; |
---|
| 142 | |
---|
| 143 | my $regex_string = join ( '|', @words ) ; |
---|
| 144 | if ( $find_complete_words ) { |
---|
| 145 | $regex_string = "\\b($regex_string)\\b"; |
---|
| 146 | } |
---|
| 147 | my $regex; |
---|
| 148 | |
---|
| 149 | if ( $find_case_insensitive ) { |
---|
| 150 | $regex = qr/$regex_string/i ; |
---|
| 151 | } else { |
---|
| 152 | $regex = qr/$regex_string/; |
---|
| 153 | } |
---|
| 154 | |
---|
| 155 | return $regex; |
---|
| 156 | } |
---|
| 157 | |
---|
| 158 | sub usage() |
---|
| 159 | { |
---|
| 160 | print <<EOF; |
---|
| 161 | |
---|
| 162 | Word-List Grep |
---|
| 163 | Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu ) |
---|
| 164 | |
---|
| 165 | Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE] |
---|
| 166 | |
---|
| 167 | -s - do not filter first line - always output the first line from the input file. |
---|
| 168 | -w - search for complete words (not partial sub-strings). |
---|
| 169 | -i - case insensitive search. |
---|
| 170 | -v - inverse - output lines NOT matching the word list. |
---|
| 171 | -c N - check only column N, instead of entire line (line split by whitespace). |
---|
| 172 | -o OUT - specify output file (default = STDOUT). |
---|
| 173 | WORD-LIST-FILE - file containing one word per line. These will be used |
---|
| 174 | for the search. |
---|
| 175 | INPUT-FILE - (optional) read from file (default = from STDIN). |
---|
| 176 | |
---|
| 177 | |
---|
| 178 | |
---|
| 179 | EOF |
---|
| 180 | |
---|
| 181 | exit; |
---|
| 182 | } |
---|