#!/usr/athena/bin/perl -w
#
# Bengali/Bangla/Assamese input preprocessor based on ITRANS http://www.aczone.com/itrans
#
# Copyright (C) 2002-2005, Arun A Tharuvai
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# Usage: reads lines from the files named on the command line (or STDIN),
# converts ITRANS-style romanised tokens to Bengali Unicode characters and
# writes the result to STDOUT as UTF-8, one output line per input line.
# Characters that are not part of any known token are copied through
# unchanged, in their original position.

use charnames ":full";
use strict;
use Encode qw(decode);

# Longest key, in characters, in any of the tables below.
my $MAX_TOKEN_LEN = 3;

# ITRANS token => independent vowel letter (used when the vowel does not
# follow a consonant).  Disabled entries are kept as comments; they appear
# to name characters that the Bengali block does not define.
my %indvowels = (
    'a'   => "\N{BENGALI LETTER A}",
    'A'   => "\N{BENGALI LETTER AA}",
    'aa'  => "\N{BENGALI LETTER AA}",
    'i'   => "\N{BENGALI LETTER I}",
    'ii'  => "\N{BENGALI LETTER II}",
    'I'   => "\N{BENGALI LETTER II}",
    'u'   => "\N{BENGALI LETTER U}",
    'uu'  => "\N{BENGALI LETTER UU}",
    'U'   => "\N{BENGALI LETTER UU}",
    'RRi' => "\N{BENGALI LETTER VOCALIC R}",
    'R^i' => "\N{BENGALI LETTER VOCALIC R}",
    'RRI' => "\N{BENGALI LETTER VOCALIC RR}",
    'R^I' => "\N{BENGALI LETTER VOCALIC RR}",
    'LLi' => "\N{BENGALI LETTER VOCALIC L}",
    'L^i' => "\N{BENGALI LETTER VOCALIC L}",
    'LLI' => "\N{BENGALI LETTER VOCALIC LL}",
    'L^I' => "\N{BENGALI LETTER VOCALIC LL}",
#   'a.c' => "\N{BENGALI VOWEL SIGN CANDRA E}",
    'e'   => "\N{BENGALI LETTER E}",
    'ai'  => "\N{BENGALI LETTER AI}",
#   'A.c' => "\N{BENGALI VOWEL SIGN CANDRA O}",
#   'aa.c' => "\N{BENGALI VOWEL SIGN CANDRA O}",
    'o'   => "\N{BENGALI LETTER O}",
    'au'  => "\N{BENGALI LETTER AU}",
    'aM'  => "\N{BENGALI LETTER A}\N{BENGALI SIGN ANUSVARA}",
    'aH'  => "\N{BENGALI LETTER A}\N{BENGALI SIGN VISARGA}",
    'a.N' => "\N{BENGALI LETTER A}\N{BENGALI SIGN CANDRABINDU}",
#   '.'   => "\N{BENGALI DANDA}",
#   '..'  => "\N{BENGALI DOUBLE DANDA}",
#   'AUM' => "\N{BENGALI OM}",
#   '.a'  => "\N{BENGALI SIGN AVAGRAHA}",
);

# ITRANS token => dependent vowel sign (used directly after a consonant).
# There is deliberately no 'a' key: a plain 'a' is the inherent vowel and
# is handled in detvowel().
my %depvowels = (
    'A'   => "\N{BENGALI VOWEL SIGN AA}",
    'aa'  => "\N{BENGALI VOWEL SIGN AA}",
    'i'   => "\N{BENGALI VOWEL SIGN I}",
    'ii'  => "\N{BENGALI VOWEL SIGN II}",
    'I'   => "\N{BENGALI VOWEL SIGN II}",
    'u'   => "\N{BENGALI VOWEL SIGN U}",
    'uu'  => "\N{BENGALI VOWEL SIGN UU}",
    'U'   => "\N{BENGALI VOWEL SIGN UU}",
    'RRi' => "\N{BENGALI VOWEL SIGN VOCALIC R}",
    'R^i' => "\N{BENGALI VOWEL SIGN VOCALIC R}",
    'RRI' => "\N{BENGALI VOWEL SIGN VOCALIC RR}",
    'R^I' => "\N{BENGALI VOWEL SIGN VOCALIC RR}",
    'LLi' => "\N{BENGALI VOWEL SIGN VOCALIC L}",
    'L^i' => "\N{BENGALI VOWEL SIGN VOCALIC L}",
    'LLI' => "\N{BENGALI VOWEL SIGN VOCALIC LL}",
    'L^I' => "\N{BENGALI VOWEL SIGN VOCALIC LL}",
#   'a.c' => "\N{BENGALI VOWEL SIGN CANDRA E}",
    'e'   => "\N{BENGALI VOWEL SIGN E}",
    'ai'  => "\N{BENGALI VOWEL SIGN AI}",
#   'A.c' => "\N{BENGALI VOWEL SIGN CANDRA O}",
#   'aa.c' => "\N{BENGALI VOWEL SIGN CANDRA O}",
    'o'   => "\N{BENGALI VOWEL SIGN O}",
    'au'  => "\N{BENGALI VOWEL SIGN AU}",
    # Fixed: the leading backslash was missing, so the literal text
    # "N{BENGALI SIGN ANUSVARA}" was emitted instead of the sign.
    'aM'  => "\N{BENGALI SIGN ANUSVARA}",
    'aH'  => "\N{BENGALI SIGN VISARGA}",
    'a.N' => "\N{BENGALI SIGN CANDRABINDU}",
);

# ITRANS token => consonant letter (or a fixed conjunct for x / GY / dny).
my %consonants = (
    'k'   => "\N{BENGALI LETTER KA}",
    'kh'  => "\N{BENGALI LETTER KHA}",
    'g'   => "\N{BENGALI LETTER GA}",
    'gh'  => "\N{BENGALI LETTER GHA}",
    '~N'  => "\N{BENGALI LETTER NGA}",
    'ch'  => "\N{BENGALI LETTER CA}",
    'Ch'  => "\N{BENGALI LETTER CHA}",
    'j'   => "\N{BENGALI LETTER JA}",
    # Fixed: this key was 'hh', which made "jh" unreachable as JHA and
    # turned a doubled "h" into JHA.
    'jh'  => "\N{BENGALI LETTER JHA}",
    '~n'  => "\N{BENGALI LETTER NYA}",
    'T'   => "\N{BENGALI LETTER TTA}",
    'Th'  => "\N{BENGALI LETTER TTHA}",
    'D'   => "\N{BENGALI LETTER DDA}",
    'Dh'  => "\N{BENGALI LETTER DDHA}",
    'N'   => "\N{BENGALI LETTER NNA}",
    't'   => "\N{BENGALI LETTER TA}",
    'th'  => "\N{BENGALI LETTER THA}",
    'd'   => "\N{BENGALI LETTER DA}",
    'dh'  => "\N{BENGALI LETTER DHA}",
    'n'   => "\N{BENGALI LETTER NA}",
    'p'   => "\N{BENGALI LETTER PA}",
    'ph'  => "\N{BENGALI LETTER PHA}",
    'b'   => "\N{BENGALI LETTER BA}",
    'bh'  => "\N{BENGALI LETTER BHA}",
    'm'   => "\N{BENGALI LETTER MA}",
    'y'   => "\N{BENGALI LETTER YA}",
    'r'   => "\N{BENGALI LETTER RA}",
    'l'   => "\N{BENGALI LETTER LA}",
#   'v'   => "\N{BENGALI LETTER VA}",
#   'w'   => "\N{BENGALI LETTER VA}",
    'sh'  => "\N{BENGALI LETTER SHA}",
    'Sh'  => "\N{BENGALI LETTER SSA}",
    's'   => "\N{BENGALI LETTER SA}",
    'h'   => "\N{BENGALI LETTER HA}",
#   'L'   => "\N{BENGALI LETTER LLA}",
#   'ld'  => "\N{BENGALI LETTER LLA}",
    'x'   => "\N{BENGALI LETTER KA}\N{BENGALI SIGN VIRAMA}\N{BENGALI LETTER SSA}",
    'GY'  => "\N{BENGALI LETTER JA}\N{BENGALI SIGN VIRAMA}\N{BENGALI LETTER NYA}",
    'dny' => "\N{BENGALI LETTER JA}\N{BENGALI SIGN VIRAMA}\N{BENGALI LETTER NYA}",
#   'q'   => "\N{BENGALI LETTER QA}",
#   'K'   => "\N{BENGALI LETTER KHHA}",
#   'G'   => "\N{BENGALI LETTER GHHA}",
#   'z'   => "\N{BENGALI LETTER ZA}",
#   'J'   => "\N{BENGALI LETTER ZA}",
#   'f'   => "\N{BENGALI LETTER FA}",
#   '.D'  => "\N{BENGALI LETTER DDDHA}",
    '.Dh' => "\N{BENGALI LETTER RHA}",
    'Y'   => "\N{BENGALI LETTER YYA}",
    'R'   => "\N{BENGALI LETTER RRA}",
);

# _longest_match(\%table, \@chars, $pos)
#
# Finds the longest key of %table (up to $MAX_TOKEN_LEN characters) that
# starts at $chars->[$pos].  Never reads past the end of @chars.
# Returns (length_of_key, mapped_string), or (0, '') when nothing matches.
sub _longest_match {
    my ($table, $chars, $pos) = @_;

    my $avail = @$chars - $pos;
    my $max   = $avail < $MAX_TOKEN_LEN ? $avail : $MAX_TOKEN_LEN;

    for (my $len = $max; $len >= 1; $len--) {
        my $key = join '', @{$chars}[$pos .. $pos + $len - 1];
        return ($len, $table->{$key}) if defined $table->{$key};
    }
    return (0, '');
}

# detvowel($l1, $l2, $l3, $l4)
#
# Decides what follows a consonant, given the next input characters
# (any of which may be undef or missing at the end of the input).
# $l4 is accepted for compatibility with existing callers but is not used.
#
# Returns a two-element list ($used, $outstr):
#   - a dependent vowel sign and the number of characters it consumed, or
#   - (1, '') for a plain 'a', the inherent vowel, which needs no sign, or
#   - (0, VIRAMA) when no vowel follows, so the consonant is marked bare.
sub detvowel {
    my ($l1, $l2, $l3, $l4) = @_;

    # Keep only the leading run of real characters.  An undef or empty slot
    # must not be glued onto a shorter key, or $used would be overstated.
    my @ahead;
    for my $ch ($l1, $l2, $l3) {
        last unless defined $ch && length $ch;
        push @ahead, $ch;
    }

    my ($used, $outstr) = _longest_match(\%depvowels, \@ahead, 0);
    return ($used, $outstr) if $used;

    return (1, '') if @ahead && $ahead[0] eq 'a';
    return (0, "\N{BENGALI SIGN VIRAMA}");
}

# transliterate($text)
#
# Converts one string of ITRANS input to Bengali and returns the result.
# At every position the longest matching token wins; on equal length a
# consonant is preferred over an independent vowel.  A consonant is always
# followed by whatever detvowel() returns for the characters after it.
# Anything that matches no token (whitespace, punctuation, digits, ...) is
# copied to the output in place.
sub transliterate {
    my ($text) = @_;

    my @chars = split //, $text;
    my $out   = '';
    my $pos   = 0;

    while ($pos < @chars) {
        my ($clen, $cons)  = _longest_match(\%consonants, \@chars, $pos);
        my ($vlen, $vowel) = _longest_match(\%indvowels,  \@chars, $pos);

        if ($clen && $clen >= $vlen) {
            $pos += $clen;

            # Copy the look-ahead window so the slice cannot extend @chars.
            my $last = $pos + 3;
            $last = $#chars if $last > $#chars;
            my @ahead = @chars[$pos .. $last];

            my ($used, $sign) = detvowel(@ahead);
            $out .= $cons . $sign;
            $pos += $used;
        }
        elsif ($vlen) {
            $out .= $vowel;
            $pos += $vlen;
        }
        else {
            # Appended rather than printed immediately, so that output
            # keeps the order of the input.
            $out .= $chars[$pos];
            $pos++;
        }
    }
    return $out;
}

# main()
#
# Filter loop: one transliterated line out for each line in.  The input is
# decoded as UTF-8 and the output encoded as UTF-8, so non-ASCII text that
# is passed through survives intact and no "Wide character" warning is
# raised.
sub main {
    binmode STDOUT, ':encoding(UTF-8)';

    while (my $line = <>) {
        # Strip the terminator first; otherwise it would be copied through
        # and then a second newline added.
        chomp $line;
        print transliterate(decode('UTF-8', $line)), "\n";
    }
    return;
}

# Run the filter only when executed as a script, not when loaded by
# another file.
main() unless caller;

1;