#! perl -lw use lib "/mit/kenta/.shd/perl/Text-DoubleMetaphone-0.07/lib/perl/5.8.8/"; use Text::DoubleMetaphone qw(double_metaphone); #use Digest::SHA qw(sha512); use Digest::MD5 qw(md5); while (<>) { chomp; next if /'/; $o=$_; s/[\xe2\xe4\xe5]/a/g; s/[\xe7]/c/g; s/[\xe8\xe9\xea]/e/g; s/[\xf1]/n/g; s/[\xf3\xf6]/o/g; s/[\xfb]/u/g; next unless /^[a-z]{2,}$/; next unless /[aeiouy]/; $w{$_}=$o; } if (1) { #skip words that differ only at the end for (sort keys%w) { ($pre=$_); chop$pre; if ($w{$pre}) { #print "$_ $pre"; next; } if (/(.+)ed$/) { if ($w{$1}) { #print "$_ $1"; next; } } if (/(.+(.))(.)ed$/ and $2 eq $3) { if ($w{$1}) { #print "$_ $1"; next; } } if (/(.+)ie[ds]$/) { if ($w{$1."y"}) { #print "$_ ${1}y"; next; } } $w2{$_}=$w{$_} } %w=%w2; } if (1) { #skip one-syllable words for (sort keys %w) { #$s=&syllable_count($_); $i=&is_one_syllable($_); if #(($s==1)!=$i) ($i) { #print "$_ $s $i"; next; } if (length(double_metaphone($_))<4) { next; } $w3{$_}=$w{$_} } %w=%w3; } if (1) { for (0..512) { push @a,{}; } for (sort keys %w) { #print; $code=double_metaphone($_); $codecount{$code}++; #print "$_ $code"; $h=unpack("B*",md5($code)); #figure out which offset has full coverage for $offset (110) { $bh=substr($h,$offset,10); $a[$offset]{$bh}=1 } push @{$aa{$bh}},$_; } $max=0; for (0..118) { $n=scalar(keys %{$a[$_]}); $ctx{$n}++; $max=$n if $n>$max; } for (0..118) { $n=scalar(keys %{$a[$_]}); #print "$_ ",$n if $n==$max; } #print "---"; for (sort keys %ctx) { #print "$_ $ctx{$_}" } } if (1) { for (sort keys %aa) { $min=999999; for (@{$aa{$_}}) { $n=length($_); if ($n<$min) { $min=$n; $mincount=0; } if ($n==$min) { $mincount++; } } $outline=""; $min+=2 if $mincount==1; #$min=3 if $min<3; #$ct=0; $prevd=""; for (@{$aa{$_}}) { $n=length($_); next if $n>$min; #$ct++; $dm=double_metaphone($_); if ($dm ne $prevd) { $outline .= " $dm:"; } $prevd=$dm; $outline .=" $_"; } print "$_ $outline"; } print "--"; print scalar keys %aa," bitstrings"; print scalar(keys %codecount)," double_metaphone codes" ; } sub is_one_syllable{ for ($_[0]) { if (/^[^aeiouy]*[aeiou][^aeiouy]*$/) { return 1; } if (/^[^aeiouy]+y$/) { return 1; } if (/^[^aeiouy]*([aeiou]+)le$/) { return 1; } if (/^[^aeiouy]*[aeiou]+[^aeiouy]*([^ilueoa])e$/) { #print "$1 $_"; return 1; } if (/^[^aeiouy]*[aeiou]+[^aeiouy]*$/ and not /ing$/ and not /iest$/) { return 1; } if (/^[^aeiouy]*[aeiou]+y$/ and not /ooey$/) { return 1; } if (/^[^aeiouy]+y[^aeiouy]*e?$/ and not /le$/) { return 1; } if (/^y[aeiou]+[^aeiouy]*e?$/) { return 1; } if (length $_<4) { #return 1; } } return 0; } sub syllable_count { my $c=0; my $temp=shift; for ($temp) { s/e$//; s/^y/Y/; s/y([aeiou])/Y$1/g; while (/[aeiouy]+/g) { $c++; } } $c=2 if $c<2; #because 1-syllable should have been eliminated $c }