root/galaxy-central/tools/unix_tools/word_list_grep.pl

リビジョン 3, 3.8 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1#!/usr/bin/perl
2use strict;
3use warnings;
4use Getopt::Std;
5
# Forward declarations, so the subs defined at the bottom of the file can be
# called (with their prototypes) from the main program below.
sub parse_command_line();
sub load_word_list();
sub compile_regex(@);
sub usage();

# Program settings; all of them are assigned by parse_command_line().
my $word_list_file;             # path of the file holding the words to search for
my $input_file;                 # handle to read from (named input file or STDIN)
my $output_file;                # handle to write to (-o file or STDOUT)
my $find_complete_words;        # -w: match complete words only
my $find_inverse;               # -v: output the lines that do NOT match
my $find_in_specific_column;    # -c N: 1-based column to check, 0 = whole line
my $find_case_insensitive;      # -i: ignore case
my $skip_first_line;            # -s: pass the first input line through unfiltered


##
## Program Start
##
usage() if @ARGV == 0;
parse_command_line();

my @words = load_word_list();
my $regex = compile_regex(@words);

# -s: the first line is copied to the output without being filtered.
# The input may be empty, in which case there is no first line to copy.
if ($skip_first_line) {
    my $header = <$input_file>;
    print {$output_file} $header if defined $header;
}


##
## Main loop
##
LINE:
while ( my $line = <$input_file> ) {
    my $target = $line;

    # When searching in a specific column (and not in the entire line),
    # match only against the content of that whitespace-separated column.
    if ($find_in_specific_column) {
        my @columns = split ' ', $line;

        # Not enough columns in this line - skip it (it is never printed,
        # not even in inverse mode).
        next LINE if @columns < $find_in_specific_column;

        $target = $columns[ $find_in_specific_column - 1 ];
    }

    # Print the line when it matches, or - in inverse mode - when it does not.
    # Logical xor is used on purpose: bitwise '^' would only give the right
    # answer as long as both operands are numeric-looking booleans.
    my $matched = ( $target =~ $regex ) ? 1 : 0;
    if ( $matched xor $find_inverse ) {
        print {$output_file} $line;
    }
}

close $input_file;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close must be checked for the output.
close $output_file
    or die "$0: Failed to write output: $!\n";

##
## Program end
##

# parse_command_line()
#
# Parses the options and positional arguments in @ARGV and stores the result
# in the file-level settings variables:
#   $find_complete_words, $find_inverse, $find_case_insensitive,
#   $skip_first_line          - set from the -w, -v, -i and -s flags
#   $find_in_specific_column  - value of -c (a positive integer), or 0
#   $output_file              - handle opened on the -o file, or STDOUT
#   $word_list_file           - first positional argument (must exist)
#   $input_file               - handle opened on the second positional
#                               argument, or STDIN when it is absent
#
# Takes no arguments and returns nothing useful. Dies with a message
# prefixed by the program name on any invalid option, missing or
# non-existent word-list file, invalid column number, or file that cannot
# be opened.
sub parse_command_line()
{
    my %opts;
    getopts( 'siwvc:o:', \%opts ) or die "$0: Invalid option specified\n";

    # getopts() has removed the options; what is left are the file names.
    die "$0: missing word-list file name\n" if @ARGV == 0;

    $word_list_file = $ARGV[0];
    die "$0: Word-list file '$word_list_file' not found\n"
        unless -e $word_list_file;

    $find_complete_words   = exists $opts{w};
    $find_inverse          = exists $opts{v};
    $find_case_insensitive = exists $opts{i};
    $skip_first_line       = exists $opts{s};

    # Search in specific column ?
    if ( defined $opts{c} ) {
        $find_in_specific_column = $opts{c};

        # Must be a plain positive integer; \A...\z (unlike ^...$) also
        # rejects a value with a trailing newline.
        die "$0: invalid column number ($find_in_specific_column).\n"
            unless $find_in_specific_column =~ /\A[0-9]+\z/
                && $find_in_specific_column > 0;
    }
    else {
        $find_in_specific_column = 0;
    }

    # Output File specified (instead of STDOUT) ?
    # Three-argument open: the file name is never parsed for mode characters,
    # so names containing leading/trailing whitespace, '>', '|' or '&' are
    # taken literally.
    if ( defined $opts{o} ) {
        my $filename = $opts{o};
        open $output_file, '>', $filename
            or die "$0: Failed to create output file '$filename': $!\n";
    }
    else {
        $output_file = *STDOUT;
    }

    # Input file Specified (instead of STDIN) ?
    if ( @ARGV > 1 ) {
        my $filename = $ARGV[1];
        open $input_file, '<', $filename
            or die "$0: Failed to open input file '$filename': $!\n";
    }
    else {
        $input_file = *STDIN;
    }
}

# load_word_list()
#
# Reads the file named by the file-level variable $word_list_file and returns
# the list of words it contains, one per line. Surrounding whitespace
# (including the line terminator) is stripped, blank lines are ignored, and
# every word is passed through quotemeta so that it can be placed in a
# regular expression and match literally.
#
# Dies if the file cannot be opened or contains no words at all.
sub load_word_list()
{
    # Lexical handle + three-argument open: no global bareword handle, and
    # the file name is never interpreted as part of the open mode.
    open my $word_list_fh, '<', $word_list_file
        or die "$0: Failed to open word-list file '$word_list_file': $!\n";

    my @words;
    while ( my $word = <$word_list_fh> ) {
        $word =~ s/^\s+//;
        $word =~ s/\s+$//;      # also removes the trailing newline / CR
        next if length($word) == 0;
        push @words, quotemeta $word;
    }
    close $word_list_fh;

    die "$0: Error: word-list file '$word_list_file' is empty!\n"
        unless @words;

    return @words;
}

# compile_regex(@words)
#
# Builds one compiled regular expression (qr//) that matches if any of the
# given words occurs in the target string.
#
#   @words - the alternatives; they are inserted into the pattern as-is, so
#            the caller is expected to have escaped them already
#            (load_word_list() does this with quotemeta).
#
# Reads two file-level settings:
#   $find_complete_words   - when true, a word matches only if it is not
#                            directly preceded or followed by a word
#                            character (\w).
#   $find_case_insensitive - when true, the pattern is compiled with /i.
#
# Returns the compiled regex.
sub compile_regex(@)
{
    my @words = @_;

    my $pattern = join '|', @words;

    if ($find_complete_words) {
        # Look-arounds instead of \b...\b: \b needs a word character on one
        # side, so a word that starts or ends with punctuation (e.g. "c++")
        # would fail to match in "c++ rocks" and would wrongly match inside
        # "c++x". For words made of word characters only, the two forms are
        # equivalent.
        $pattern = "(?<!\\w)(?:$pattern)(?!\\w)";
    }

    return $find_case_insensitive ? qr/$pattern/i : qr/$pattern/;
}

# usage()
#
# Prints the help text (program name, options and arguments) to standard
# output and terminates the program with exit status 0. Never returns.
sub usage()
{
    # Interpolating here-document: $0 expands to the program name.
    my $help_text = <<"END_OF_USAGE";

Word-List Grep
Copyright (C) 2009 - by A. Gordon ( gordon at cshl dot edu )

Usage: $0 [-o OUTPUT] [-s] [-w] [-i] [-c N] [-v] WORD-LIST-FILE [INPUT-FILE]

   -s   - do not filter first line - always output the first line from the input file.
   -w   - search for complete words (not partial sub-strings).
   -i   - case insensitive search.
   -v   - inverse - output lines NOT matching the word list.
   -c N - check only column N, instead of entire line (line split by whitespace).
   -o OUT - specify output file (default = STDOUT).
   WORD-LIST-FILE - file containing one word per line. These will be used
          for the search.
   INPUT-FILE - (optional) read from file (default = from STDIN).



END_OF_USAGE

    print $help_text;

    exit 0;
}
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。