#!/usr/athena/bin/perl -w
# Tamil input preprocessor using the itrans transliteration method
#
## Copyright (C) 2002-2005, Arun A Tharuvai
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; if not, write to the Free Software
## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

use charnames ":full";
use strict;
use warnings;

# Independent (stand-alone) vowels, keyed by their transliteration.
# Several spellings may map to the same letter (e.g. 'A' and 'aa').
my %indvowels = (
    'a'  => "\N{TAMIL LETTER A}",
    'A'  => "\N{TAMIL LETTER AA}",
    'aa' => "\N{TAMIL LETTER AA}",
    'i'  => "\N{TAMIL LETTER I}",
    'ii' => "\N{TAMIL LETTER II}",
    'I'  => "\N{TAMIL LETTER II}",
    'u'  => "\N{TAMIL LETTER U}",
    'uu' => "\N{TAMIL LETTER UU}",
    'U'  => "\N{TAMIL LETTER UU}",
    'e'  => "\N{TAMIL LETTER E}",
    'ee' => "\N{TAMIL LETTER EE}",
    'E'  => "\N{TAMIL LETTER EE}",
    'ai' => "\N{TAMIL LETTER AI}",
    'o'  => "\N{TAMIL LETTER O}",
    'oo' => "\N{TAMIL LETTER OO}",
    'O'  => "\N{TAMIL LETTER OO}",
    'au' => "\N{TAMIL LETTER AU}",
    'q'  => "\N{TAMIL SIGN VISARGA}",
);

# Dependent vowel signs, i.e. the form a vowel takes directly after a
# consonant.  'a' is deliberately absent: it is the inherent vowel and
# produces no sign (see detvowel).
my %depvowels = (
    # 'a' => "\N{TAMIL VOWEL SIGN A}",
    'A'  => "\N{TAMIL VOWEL SIGN AA}",
    'aa' => "\N{TAMIL VOWEL SIGN AA}",
    'i'  => "\N{TAMIL VOWEL SIGN I}",
    'ii' => "\N{TAMIL VOWEL SIGN II}",
    'I'  => "\N{TAMIL VOWEL SIGN II}",
    'u'  => "\N{TAMIL VOWEL SIGN U}",
    'uu' => "\N{TAMIL VOWEL SIGN UU}",
    'U'  => "\N{TAMIL VOWEL SIGN UU}",
    'e'  => "\N{TAMIL VOWEL SIGN E}",
    'ee' => "\N{TAMIL VOWEL SIGN EE}",
    'E'  => "\N{TAMIL VOWEL SIGN EE}",
    'ai' => "\N{TAMIL VOWEL SIGN AI}",
    'o'  => "\N{TAMIL VOWEL SIGN O}",
    'oo' => "\N{TAMIL VOWEL SIGN OO}",
    'O'  => "\N{TAMIL VOWEL SIGN OO}",
    'au' => "\N{TAMIL VOWEL SIGN AU}",
);

# Consonants, keyed by their one- or two-character transliteration.
my %consonants = (
    'k'  => "\N{TAMIL LETTER KA}",
    'g'  => "\N{TAMIL LETTER KA}",
    '~N' => "\N{TAMIL LETTER NGA}",
    'N^' => "\N{TAMIL LETTER NGA}",
    'ch' => "\N{TAMIL LETTER CA}",
    '~n' => "\N{TAMIL LETTER NYA}",
    'T'  => "\N{TAMIL LETTER TTA}",
    'Th' => "\N{TAMIL LETTER TTA}",
    'N'  => "\N{TAMIL LETTER NNA}",
    't'  => "\N{TAMIL LETTER TA}",
    'th' => "\N{TAMIL LETTER TA}",
    'n'  => "\N{TAMIL LETTER NA}",
    'p'  => "\N{TAMIL LETTER PA}",
    'b'  => "\N{TAMIL LETTER PA}",
    'm'  => "\N{TAMIL LETTER MA}",
    'y'  => "\N{TAMIL LETTER YA}",
    'r'  => "\N{TAMIL LETTER RA}",
    'l'  => "\N{TAMIL LETTER LA}",
    'v'  => "\N{TAMIL LETTER VA}",
    'w'  => "\N{TAMIL LETTER VA}",
    'J'  => "\N{TAMIL LETTER LLLA}",
    'z'  => "\N{TAMIL LETTER LLLA}",
    'L'  => "\N{TAMIL LETTER LLA}",
    'R'  => "\N{TAMIL LETTER RRA}",
    '^n' => "\N{TAMIL LETTER NNNA}",
    'j'  => "\N{TAMIL LETTER JA}",
    'Sh' => "\N{TAMIL LETTER SSA}",
    's'  => "\N{TAMIL LETTER SA}",
    'h'  => "\N{TAMIL LETTER HA}",
    'x'  => "\N{TAMIL LETTER KA}\N{TAMIL SIGN VIRAMA}\N{TAMIL LETTER SSA}",
);

# detvowel($l1, $l2)
#
# Decide which vowel sign follows a consonant, given the next two input
# characters.  Either argument may be undef or '' when the consonant sits
# at (or one character before) the end of the input.
#
# Returns a two-element list ($used, $outstr):
#   $used   - how many input characters were consumed (0, 1 or 2)
#   $outstr - the text to append after the consonant: a dependent vowel
#             sign, '' for the inherent vowel 'a', or a VIRAMA when no
#             vowel follows (in which case nothing is consumed).
sub detvowel {
    my ($l1, $l2) = @_;
    $l1 = '' unless defined $l1;
    $l2 = '' unless defined $l2;

    # Only try the two-character spelling when two characters really are
    # available; otherwise 'i' . '' would match the one-character key 'i'
    # and wrongly report two characters consumed.
    if ($l1 ne '' && $l2 ne '' && exists $depvowels{$l1 . $l2}) {
        return (2, $depvowels{$l1 . $l2});
    }

    if (exists $depvowels{$l1}) {
        return (1, $depvowels{$l1});
    }

    # The inherent vowel is consumed but adds nothing to the output.
    if ($l1 eq 'a') {
        return (1, "");
    }

    return (0, "\N{TAMIL SIGN VIRAMA}");
}

# detvowelold($l1, $l2)
#
# Earlier, hard-coded version of detvowel.  Nothing in this file calls
# it.  Same arguments and return value as detvowel, but it recognises
# fewer spellings (no 'ee' or 'oo').
sub detvowelold {
    my ($l1, $l2) = @_;
    $l1 = '' unless defined $l1;
    $l2 = '' unless defined $l2;

    # Two-character spellings.
    my %pair_sign = (
        'ai' => "\N{TAMIL VOWEL SIGN AI}",
        'aa' => "\N{TAMIL VOWEL SIGN AA}",
        'au' => "\N{TAMIL VOWEL SIGN AU}",
        'ii' => "\N{TAMIL VOWEL SIGN II}",
        'uu' => "\N{TAMIL VOWEL SIGN UU}",
    );

    # One-character spellings; 'a' is the inherent vowel (no sign).
    my %single_sign = (
        'a' => "",
        'A' => "\N{TAMIL VOWEL SIGN AA}",
        'i' => "\N{TAMIL VOWEL SIGN I}",
        'I' => "\N{TAMIL VOWEL SIGN II}",
        'u' => "\N{TAMIL VOWEL SIGN U}",
        'U' => "\N{TAMIL VOWEL SIGN UU}",
        'e' => "\N{TAMIL VOWEL SIGN E}",
        'E' => "\N{TAMIL VOWEL SIGN EE}",
        'o' => "\N{TAMIL VOWEL SIGN O}",
        'O' => "\N{TAMIL VOWEL SIGN OO}",
    );

    # 'au' consumes two characters like the other pairs (the previous
    # if/elsif chain forgot to set the count for this one case).
    if ($l1 ne '' && $l2 ne '' && exists $pair_sign{$l1 . $l2}) {
        return (2, $pair_sign{$l1 . $l2});
    }

    if (exists $single_sign{$l1}) {
        return (1, $single_sign{$l1});
    }

    return (0, "\N{TAMIL SIGN VIRAMA}");
}

# Character of $text at $index, or '' when $index is past the end.
sub _char_at {
    my ($text, $index) = @_;
    return $index < length($text) ? substr($text, $index, 1) : '';
}

# transliterate($text)
#
# Convert one chunk of transliterated text to Tamil and return it.
# Consonants are followed by whatever detvowel picks; vowels that do not
# follow a consonant use their independent form.  Every other character
# (whitespace, line terminators, punctuation, digits, ...) is copied to
# the same position in the result.
sub transliterate {
    my ($text) = @_;
    return '' unless defined $text;

    my $len = length $text;
    my $out = '';
    my $pos = 0;

    while ($pos < $len) {
        my $one = substr $text, $pos, 1;

        # Longest spelling first, so 'th' wins over 't'.  The
        # two-character candidate is offered only when it exists.
        my @candidates = ($one);
        unshift @candidates, substr($text, $pos, 2) if $pos + 1 < $len;

        my ($cons) = grep { exists $consonants{$_} } @candidates;
        if (defined $cons) {
            $out .= $consonants{$cons};
            $pos += length $cons;

            my ($used, $sign) = detvowel(
                _char_at($text, $pos),
                _char_at($text, $pos + 1),
            );
            $out .= $sign;
            $pos += $used;
            next;
        }

        my ($vowel) = grep { exists $indvowels{$_} } @candidates;
        if (defined $vowel) {
            $out .= $indvowels{$vowel};
            $pos += length $vowel;
            next;
        }

        # Not part of the scheme: keep it where it was in the input.
        $out .= $one;
        $pos++;
    }

    return $out;
}

# Filter the files named on the command line (or STDIN) to STDOUT.
sub main {
    # The output contains characters above U+00FF.
    binmode STDOUT, ':utf8';

    while (my $line = <>) {
        # Treat the input as UTF-8 so non-ASCII text that is merely
        # passed through is not re-encoded byte by byte.  If the line is
        # not valid UTF-8, utf8::decode leaves it unchanged.
        utf8::decode($line);

        # The line terminator travels through transliterate untouched,
        # so nothing is appended here.
        print transliterate($line);
    }

    return;
}

# Run the filter only when executed as a script, so the subs above can
# be loaded from elsewhere without consuming STDIN.
main() unless caller;