#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Pod::Usage; use File::Path; my ( $help, $man ); my $verbose = 0; my $input_training_file; my $input_testing_file; my $model_dir; my $feature_dir; my $result_dir; my $do_stemming; my $vocab_file; my $log_odds_cutoff = 0; my $feature_choice = "mi"; GetOptions( 'help|?' => \$help, man => \$man, 'verbose+' => \$verbose, 'input-training-file=s' => \$input_training_file, 'input-testing-file=s' => \$input_testing_file, 'model-dir=s' => \$model_dir, 'feature-dir=s' => \$feature_dir, 'result-dir=s' => \$result_dir, 'stem' => \$do_stemming, 'vocab-file=s' => \$vocab_file, 'log-odds-cutoff=s' => \$log_odds_cutoff, 'feature-type=s' => \$feature_choice, ) or pod2usage(2); pod2usage(1) if ($help); pod2usage( -exitstatus => 0, -verbose => 2 ) if ($man); pod2usage("-input-training-file or -input-testing-file required!\n") unless($input_training_file or $input_testing_file); pod2usage("only one of -input-training-file and -input-testing-file allowed!\n") if($input_training_file and $input_testing_file); pod2usage("-model-dir required!\n") unless($model_dir); pod2usage("-feature-dir required!\n") unless($feature_dir); pod2usage("-vocab-file required when testing!\n") if($input_testing_file and not $vocab_file); pod2usage("-result-dir required when testing!\n") if($input_testing_file and not $result_dir); $vocab_file = "$feature_dir/vocab" unless($vocab_file); # print start time system("date"); # read data my ($ref, $text); if($input_training_file) { ($ref, $text) = preprocess($input_training_file); } else { ($ref, $text) = preprocess($input_testing_file); } # stem words stem($text) if($do_stemming); # run word counting my $counts = count_words($text); # run feature generation make_mi_features($counts,$ref,$vocab_file,$feature_dir); if($input_training_file) { # train svms build_svms($model_dir,"$feature_dir/train.*"); } else { # test svms classify_testset($model_dir, "$feature_dir/testset", $ref, $result_dir); } # print finish time system("date"); ######################################## ######## Sub routines ######################################## sub preprocess { my ($infile, $outfile, $keyfile) = @_; open(my $infh, $infile) || die "Could not open $infile\n"; my @text; my @ref; while(<$infh>) { chomp; #remove leading whitespace $_ =~ s/^\s*//; #remove whitespace around delimeter $_ =~ s/\s*\|\s*/|/g; my @line = split(/\|/, $_); if($#line != 1) { print STDERR "Found $#line columns in: ", join(" | ", @line), "\n"; print STDERR "Expected only 2 columns (with delimiter '|')\n"; next; } push @ref, $line[0]; # clean up text (no quotes) $line[1] =~ tr/A-Z/a-z/; $line[1] =~ y/a-z0-9/ /cs; $line[1] =~ s/^\s+//; my @words = split(/\s+/, $line[1]); push @text, \@words; } return (\@ref, \@text); } ## Porter word stemming (code from doc2mat.pl, CLUTO preprocessing script from George Karypis karypis@cs.umn.edu) sub stem { my ($textlines) = @_; #============================================================================== # Setup the data-structures for the stemmer and initialize it #============================================================================== my %step2list = ('ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance', 'izer'=>'ize', 'bli'=>'ble', 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate', 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous', 'aliti'=>'al', 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'); my %step3list = ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic', 'ful'=>'', 'ness'=>''); my $c = "[^aeiou]"; # consonant my $v = "[aeiouy]"; # vowel my $C = "${c}[^aeiouy]*"; # consonant sequence my $V = "${v}[aeiou]*"; # vowel sequence my $mgr0 = "^(${C})?${V}${C}"; # [C]VC... is m>0 my $meq1 = "^(${C})?${V}${C}(${V})?" . '$'; # [C]VC[V] is m=1 my $mgr1 = "^(${C})?${V}${C}${V}${C}"; # [C]VCVC... is m>1 my $_v = "^(${C})?${v}"; # vowel in stem my @stem_params = (\%step2list, \%step3list, $c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v); for my $line (@{$textlines}) { for(my $n=0; $n<@{$line}; $n++) { $line->[$n] = stem_word($line->[$n], @stem_params); } } } sub stem_word { my ($word, $step2list, $step3list, $c, $v, $C, $V, $mgr0, $meq1, $mgr1, $_v) = @_; my ($stem, $suffix, $firstch); my $w = $word; if (length($w) < 3) { return $w; } # length at least 3 # now map initial y to Y so that the patterns never treat it as vowel: $w =~ /^./; $firstch = $&; if ($firstch =~ /^y/) { $w = ucfirst $w; } # Step 1a if ($w =~ /(ss|i)es$/) { $w=$`.$1; } elsif ($w =~ /([^s])s$/) { $w=$`.$1; } # Step 1b if ($w =~ /eed$/) { if ($` =~ /$mgr0/o) { chop($w); } } elsif ($w =~ /(ed|ing)$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem; if ($w =~ /(at|bl|iz)$/) { $w .= "e"; } elsif ($w =~ /([^aeiouylsz])\1$/) { chop($w); } elsif ($w =~ /^${C}${v}[^aeiouwxy]$/o) { $w .= "e"; } } } # Step 1c if ($w =~ /y$/) { $stem = $`; if ($stem =~ /$_v/o) { $w = $stem."i"; } } # Step 2 if ($w =~ /(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/) { $stem = $`; $suffix = $1; if ($stem =~ /$mgr0/o) { $w = $stem . $step2list->{$suffix}; } } # Step 3 if ($w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/) { $stem = $`; $suffix = $1; if ($stem =~ /$mgr0/o) { $w = $stem . $step3list->{$suffix}; } } # Step 4 if ($w =~ /(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/) { $stem = $`; if ($stem =~ /$mgr1/o) { $w = $stem; } } elsif ($w =~ /(s|t)(ion)$/) { $stem = $` . $1; if ($stem =~ /$mgr1/o) { $w = $stem; } } # Step 5 if ($w =~ /e$/) { $stem = $`; if ($stem =~ /$mgr1/o or ($stem =~ /$meq1/o and not $stem =~ /^${C}${v}[^aeiouwxy]$/o)) { $w = $stem; } } if ($w =~ /ll$/ and $w =~ /$mgr1/o) { chop($w); } # and turn initial Y back to y if ($firstch =~ /^y/) { $w = lcfirst $w; } return $w; } sub count_words { my ($textlines) = @_; my @counts; for my $line (@{$textlines}) { my @line = @{$line}; my %words; my $num_words = @line; for(my $n=0;$n<@line;$n++) { $words{$line[$n]} ++; } $words{'!numwords!'} = $num_words; push @counts, \%words; #print $outfh "!numwords!:$num_words "; #foreach my $word (keys %words) { #print $outfh "$word:$words{$word} "; #} #print $outfh "\n"; } return \@counts; } # # make_mi_features -- make mutual information features # sub make_mi_features { my ($counts, $ref, $vocab_file, $outdir) = @_; my (%categories); print "Get key\n" if ($verbose); get_key($ref, \%categories); mkpath([$outdir]); # collect the word frequency data my @all_words; my %word_ids; my @id_words; my @my_lines; my @my_line_cats; my $vocabfh; if($input_testing_file) { open($vocabfh, "$vocab_file") or die "Could not open $vocab_file\n"; read_vocab($vocabfh, \@all_words, \%word_ids); } else { if(-f $vocab_file) { warn "Vocab $vocab_file already exists, overwriting!\n"; } open($vocabfh, ">$vocab_file") or die "Could not open $vocab_file\n"; } # open output file(s) for features my %my_files = create_files(\%categories, $outdir); print "Collect frequency\n" if($verbose); collect_freq($counts, $vocabfh, \@all_words, \%word_ids, \@id_words, \@my_lines, \%my_files); # output our features if(not $input_testing_file) { print "Output fetures\n" if($verbose); output_features(\@my_lines, $ref, \%my_files, \@all_words, $log_odds_cutoff); } } sub get_key { my ($ref, $categories) = @_; for my $cat (@{$ref}) { if(exists $categories->{$cat}) { $categories->{$cat} += 1; } else { print "Add category $cat\n" if($verbose); $categories->{$cat} = 1; } } } sub create_files { my ($categories, $out_dir) = @_; my %my_files; my $test_fh; my $testkey_fh; if($input_testing_file) { open($test_fh, ">$out_dir/testset") or die "Could not open $out_dir/testset: $!\n"; open($testkey_fh, ">$out_dir/key.testset") or die "Could not open $out_dir/key.testset: $!\n"; } # assign a feature output file for each category # if we are testing, it will all be the same file foreach my $cat (keys %{$categories}) { print "Found $cat $categories->{$cat}\n" if($verbose); unless($input_testing_file) { open(my $fh, ">$out_dir/train.$cat") or die "Could not open file $out_dir/train.$cat\n"; open(my $keyfh, ">$out_dir/key_train.$cat") or die "Could not open file $out_dir/key_train.$cat\n"; $my_files{$cat} = $fh; $my_files{"key.$cat"} = $keyfh; } else { $my_files{$cat} = $test_fh; $my_files{"key.$cat"} = $testkey_fh; } } return %my_files; } sub collect_freq { my ($counts, $vocabfh, $all_words, $word_ids, $id_words, $my_lines, $my_files) = @_; my $vocab_cnt=1; my $my_example_cnt = 0; for(my $n=0; $n<@{$counts}; $n++) { my $count = $counts->[$n]; my @my_line; my $numwords = $count->{'!numwords!'}; $my_example_cnt++; for my $word (keys %{$count}) { my $count = $count->{$word}; my $word_freq = $count / $numwords; if(not $input_testing_file) { unless(exists $word_ids->{$word}) { $all_words->[$vocab_cnt] = $word_freq; $id_words->[$vocab_cnt] = $word; $word_ids->{$word} = $vocab_cnt++; } else { $all_words->[$word_ids->{$word}] += $word_freq; } } push @my_line, [$word_ids->{$word},$word_freq] if(exists $word_ids->{$word}); } # if training, store lines to output per category if(not $input_testing_file) { push @{$my_lines}, \@my_line; } # if testing, output features here (no need to separate into categories) else { output_feature_line(\@my_line,$ref->[$n],$my_files,$all_words, $log_odds_cutoff); } } if(not $input_testing_file) { #compute vocab averages (these average over examples, and frequencies #are not weighted by how many words were in a particular example) for(my $n=1; $n<@{$all_words};$n++) { warn "Index $n $all_words->[$n]\n" if($verbose>1); $all_words->[$n] = $all_words->[$n] / $my_example_cnt; print $vocabfh "$id_words->[$n] $word_ids->{$id_words->[$n]} $all_words->[$n]\n"; } } } sub output_features { my ($my_lines, $ref, $my_files, $all_words, $log_odds_cutoff) =@_; for(my $n=0; $n<@{$my_lines}; $n++) { my @my_line = @{$my_lines->[$n]}; output_feature_line(\@my_line,$ref->[$n],$my_files,$all_words, $log_odds_cutoff); } } sub output_feature_line { my ($my_line, $my_cat, $my_files, $all_words, $log_odds_cutoff) =@_; my @my_line = @{$my_line}; my $my_file; my $my_key_file; if(exists $my_files->{$my_cat}) { $my_file = $my_files->{$my_cat}; $my_key_file = $my_files->{"key.$my_cat"}; } else { die "Could not find my cat '$my_cat'\n"; } @my_line = sort {$a->[0] <=> $b->[0]} @my_line; print $my_file "0 " if($input_testing_file); #print dummy class for testing for(my $m=0; $m<@my_line;$m++) { my ($word_id, $word_freq) = @{$my_line[$m]}; my $word_log_odds = log($word_freq / $all_words->[$word_id]); print $my_file "$word_id:$word_log_odds " if($word_log_odds > $log_odds_cutoff); } print $my_file "\n"; print $my_key_file "$my_cat\n"; } sub read_vocab { my ($vocab_fh, $all_words, $word_ids) = @_; while(<$vocab_fh>) { chomp; my @line = split; my $word = $line[0]; my $word_id = $line[1]; my $word_freq = $line[2]; $all_words->[$word_id] = $word_freq; $word_ids->{$word} = $word_id; } close($vocab_fh); } sub build_svms { my ($model_dir, $train_glob) = @_; mkpath([$model_dir]); my @train_files = glob "$train_glob"; # read in training feature lines my @train_cats; my @train_names; foreach my $train_file (@train_files) { open(my $trainfh, $train_file) || die "Could not open file $train_file\n"; my @lines; while(<$trainfh>) { chomp; push @lines, $_; } close($trainfh); my $cat; if($train_file =~ /train\.(.*)$/) { $cat = $1; } else { die "Could not determine training category name from '$train_file'\n"; } $train_cats[@train_cats] = \@lines; $train_names[@train_names] = $cat; } # build SVM pairwise training files for(my $n=0; $n < @train_cats; $n++) { for(my $m=$#train_cats; $m>$n; $m--) { print "Build SVM for $train_names[$n] vs $train_names[$m]\n"; open(my $bldfh, ">$model_dir/bld") or die "Could not open $model_dir/bld\n"; my @train_lines1 = @{$train_cats[$n]}; my @train_lines2 = @{$train_cats[$m]}; my $max_cnt = @train_lines1; $max_cnt = @train_lines2 if(@train_lines2 > @train_lines1); $max_cnt = @train_lines1; for(my $l=0;$l<$max_cnt;$l++) { my $index = $l; while($index > $#train_lines1) { $index -= $#train_lines1; } print $bldfh "1 $train_lines1[$index]\n"; } $max_cnt = @train_lines2; for(my $l=0;$l<$max_cnt;$l++) { my $index = $l; while($index > $#train_lines2) { $index -= $#train_lines2; } print $bldfh "-1 $train_lines2[$index]\n"; } close($bldfh); #train svms my $command = "svm_learn $model_dir/bld $model_dir/svm.$train_names[$n].$train_names[$m] > tmp"; print "$command\n" if($verbose); system($command) == 0 or die "failed to execute: \"$command\" with error: $?\n"; unlink "$model_dir/bld"; } } } sub classify_testset { my ($model_dir, $test_file, $test_ref, $output_dir) = @_; my $score_dump = 0; mkpath([$output_dir]); # get all the model files my @models = `ls $model_dir`; # run svm classification for each svm model my @results; my @all_results; print "Found models: ", join(" ", @models), "\n" if($verbose>1); foreach my $model (@models) { chomp $model; my $pos_model; my $neg_model; if($model =~ /svm\.(.*)\.(.*)$/) { $pos_model = $1; $neg_model = $2; } else { die "Couldn't match model name '$model'\n"; } print "Run model $model\n"; my $command="svm_classify $test_file $model_dir/$model $output_dir/$model.results > tmp"; print "$command\n" if($verbose); system($command) == 0 or die "failed to execute: \"$command\" with error: $?\n"; open(my $resultfh, "$output_dir/$model.results") or die "Could not open file $output_dir/$model.results\n"; my $line_cnt=0; while(<$resultfh>) { chomp; my $hash_pointer; unless(defined $results[$line_cnt]) { my %hash; my %all; $results[$line_cnt] = \%hash; $all_results[$line_cnt] = \%all if($score_dump); } my $winner; my $loser; my $score; if($_ > 0) { $winner = $pos_model; $loser = $neg_model; $score = $_; } else { $winner = $neg_model; $loser = $pos_model; $score = $_ * -1; } # record winning model ${$results[$line_cnt]}{$winner} ++; # record model pair ${$all_results[$line_cnt]}{"$winner.$loser"} = $score if($score_dump); $line_cnt++; } } # read in test key # open(my $keyfh, $test_key) || die "Could not open file $test_key\n"; # my @key; # while(<$keyfh>) { # chomp; # my @line = split; # push @key, \@line; # } open(my $rankfh, ">$output_dir/rank") or die "Could not open $output_dir/rank\n"; my $scorefh; if($score_dump) { open($scorefh, ">$output_dir/compare-scores") or die "Could not open $output_dir/compare-scores\n"; } die "Different number of lines: $#results result lines and $#{$test_ref} key lines\n" unless($#results eq $#{$test_ref}); # collect results my %confusion_table; my $correct_cnt; for(my $n=0; $n<@results; $n++) { my $max=-1; my $max_model=''; my @rank; foreach my $model (keys %{$results[$n]}) { if(${$results[$n]}{$model} > $max) { $max = ${$results[$n]}{$model}; $max_model = $model; } push @rank, [ $model, ${$results[$n]}{$model} ]; } @rank = sort {$b->[1] <=> $a->[1]} @rank; print $rankfh "$test_ref->[$n] || "; # print out the models in rank order for(my $m=0; $m<@rank; $m++) { print $rankfh "$rank[$m]->[0] "; } print $rankfh "\n"; #print out scores to compare against winning model if($score_dump) { my $first_second = "$rank[0]->[0].$rank[1]->[0]"; my $first_third = "$rank[0]->[0].$rank[2]->[0]"; my $second_third = "$rank[1]->[0].$rank[2]->[0]"; print $scorefh "$test_ref->[$n] | $first_second | ", get_score($all_results[$n], $first_second), " | $first_third | ", get_score($all_results[$n], $first_third), " | $second_third | ", get_score($all_results[$n], $second_third), "\n"; } $correct_cnt++ if($max_model eq $test_ref->[$n]); $confusion_table{$test_ref->[$n]}{$max_model} += 1; } print "Accuracy: ", $correct_cnt / @results, "\n"; my %cat_totals; ## print out confusion table of results print "\n\nCount of confusion occurances, true classes are\n"; print " the first column, predictions are the first row\n"; print " ", join(" ", sort keys %confusion_table), "\n"; foreach my $truth (sort keys %confusion_table) { printf "% 2s: ", $truth; foreach my $prediction (sort keys %confusion_table ) { $confusion_table{$truth}{$prediction} = 0 unless(exists $confusion_table{$truth} and $confusion_table{$truth}{$prediction}); printf "% 5d ", $confusion_table{$truth}{$prediction}; $cat_totals{$truth} += $confusion_table{$truth}{$prediction}; } print "\n"; } print "\n\nPercentage relative to number of truth examples\n"; print " ", join(" ", sort keys %confusion_table), "\n"; foreach my $truth (sort keys %confusion_table) { printf "% 2s: ", $truth; foreach my $prediction (sort keys %confusion_table ) { printf "%0.3f ", $confusion_table{$truth}{$prediction} / $cat_totals{$truth}; } print "\n"; } print "\n\nPercentage of overall examples\n"; print " ", join(" ", sort keys %confusion_table), "\n"; foreach my $truth (sort keys %confusion_table) { printf "% 2s: ", $truth; foreach my $prediction (sort keys %confusion_table ) { printf "%0.3f ", $confusion_table{$truth}{$prediction} / @results; } print "\n"; } } # return SVM one-vs-one score sub get_score { my ($results, $pair) = @_; if(exists ${$results}{$pair}) { return ${$results}{$pair}; } else { my ($winner, $loser) = split('\.', $pair); my $reverse_pair = "$loser.$winner"; if(exists ${$results}{$reverse_pair}) { return (-1 * ${$results}{$reverse_pair}); } else { die "Couldn't find score for pair: $pair or $reverse_pair\n"; } } } __END__ =head1 NAME run-svm - control script for multiclass svm training with feature generation from text =head1 SYNOPSIS run-svm version 0.1 Copyright 2004-2007 Dustin Hillard and Stephen Purpura Contact us at sp559@cs.cornell.edu if you wish to use this script for commercial purposes. Modification and use for non-commercial purposes is allowed, but please cite. The authors make no warranties about the applicability of this software to any outcome. run-svm [options] Options: -help brief help message -man full documentation --verbose more verbose to STDERR --input-training-file file that has data to train on --input-testing-file file that has data to test on --model-dir directory with trained svm models to use in testing --output-dir directory we'll put the files in --feature feature options include {mi, tf, tfidf, tfidfL1, tfidfL2, tfidfpivot} =head1 OPTIONS =over =item B<--help> =item B<-?> Show this help message. =item B<--man> Show the manual page for this script. =item B<--verbose> Repeatable option. Report more of what we're doing. =back =head1 DESCRIPTION This control script front-ends SVMlight to perform multiclass classification using text as features. =head2 CAVEATS (1) This program has only been tested under Linux and MacOS (2) svm_learn and svm_classify must be in the path. An example for bash: export PATH=$PATH:/home/iuser/svm_scripts =head2 TO DO =head1 AUTHORS Dustin Hillard Ehillard@ee.washington.eduE Stephen Purpura Esp559@cs.cornell.eduE =cut