#!/usr/bin/perl

=pod

=head1 NAME

geneal2gedcom.pl - convert McBeath genealogy format to GEDCOM

=head1 SYNOPSIS

geneal2gedcom.pl [B<-->[B<no>]B<strict>] [B<-->[B<no>]B<strict-addresses>]
	[B<--output-file> I<file-name>] [I<input-file-name>]

=head1 DESCRIPTION

B<READ ALL OF THIS BEFORE YOU USE THIS SCRIPT.>

B<Please send me email (L<mailto:jik@alum.mit.edu>) if you use this
script.> I would really like to know if anyone else finds it useful.

This script converts a data file in Jim McBeath's genealogy data file
format, used by his "jimmc.roots" and "geneal" packages (see
L<http://alumnus.caltech.edu/~jimmc/freesw.html>), into GEDCOM format
(see
L<http://homepages.rootsweb.ancestry.com/~pmcbride/gedcom/55gctoc.htm>).

This script requires the Gedcom and Geo::StreetAddress::US Perl
modules, which you will probably need to install from CPAN (see
L<http://search.cpan.org/>).  If you don't know how to install a
module from CPAN, see L<http://cpan.org/modules/INSTALL.html>; please
don't contact me for help with this.

I wrote this specifically to convert my own data file into GEDCOM,
so there's no reason to believe it's completely generalized.  I only
know for certain that it works with the tags that I used in my data
file and the format that I used for those tags.  If you need to
modify this script to make it support more tags or enhance its
support for existing tags, please send me patches so I can
incorporate them into my version for other people to use!

=head1 OPTIONS

Specify the input file on the command line or feed the input to stdin.

The following options are available:

=over 4

=item --strict, --nostrict

Enable (default) or disable strict checking.  When strict checking is
enabled, an unrecognized tag in the input causes the script to exit
with an error.  When strict checking is disabled, a warning is printed
and the script ignored the unrecognized tag

=item --strict-addresses, --nostrict-addresses

Enable (default) or disable strict address checking.  When strict
address checking is enabled, and unparseable address in the input
causes the script to exit with an error.  When strict address checking
is disabled, a warning is printed, and the script ignores the
unparseable address.

=item --output-file file-name

Send the gedcom output to the specified file.  If not specified,
output is sent to stdout.

=back

=head1 AUTHOR

Jonathan Kamens L<mailto:jik@alum.mit.edu>

=head1 COPYRIGHT

Copyright (C) 2009 Jonathan Kamens.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

See L<http://www.gnu.org/licenses/>.

=head1 VERSION

$Id: geneal2gedcom.pl,v 1.8 2012/12/31 16:28:54 jik Exp $

=cut

use strict;
use warnings;

use File::Basename;
use File::Temp qw(tempfile);
use Gedcom 1.17;
use Geo::StreetAddress::US;
use Getopt::Long;

$/ = "\n\n";

my $whoami = basename $0;
my $usage = "Usage: $whoami [--[no]strict] [--[no]strict-addresses
	[--output-file file-name] [input-file-name]\n";

my $strict = 1;
my $strict_addresses = 1;
my $output_file;

die $usage if (! GetOptions("strict!" => \$strict,
			    "strict-addresses!" => \$strict_addresses,
			    "output-file=s" => \$output_file));

my $ged = Gedcom->new;
my (%link_child_to_family, %link_spouse_to_family, %individual_records,
    %link_address_to_individual, %ignore_address_link);

while (<>) {
    next if (! s/^(\d+).*\n//);
    my $record_identifier = $1;
    next if (! s/^T:(.)\n?//m);
    my $record_type = $1;
    my(@fields);
    for my $field_string (split(/\n\b/)) {
	my($tag, $value) = split(/:/, $field_string, 2);
	next if (! $tag);
	$value =~ s/^\+//gm;
	push(@fields, [$tag, $value]);
    }
    if ($record_type eq "I") {
	my $record = $ged->add_individual;
	my($first_name, $middle_name, $last_name, $nick_name, $name_prefix,
	   $birth, $death, $burial, @extra_names);
	for my $ref (@fields) {
	    my($tag, $value) = @{$ref};
	    if ($tag eq "LN") {
		$last_name = $value;
	    }
	    elsif ($tag eq "LN.A") {
		push(@extra_names, $value);
	    }
	    elsif ($tag eq "FN") {
		$first_name = $value;
	    }
	    elsif ($tag eq "MN") {
		$middle_name = $value;
	    }
	    elsif ($tag eq "NN") {
		$nick_name = $value;
	    }
	    elsif ($tag eq "PN") {
		$name_prefix = $value;
	    }
	    elsif ($tag eq "SX") {
		$record->add("SEX", $value);
	    }
	    elsif ($tag eq "B") {
		$birth = $record->add("BIRT") if (! $birth);
		$birth->add("DATE", $value);
	    }
	    elsif ($tag eq "BP") {
		$birth = $record->add("BIRT") if (! $birth);
		$birth->add("PLAC", $value);
	    }
	    elsif ($tag eq "D") {
		$death = $record->add("DEAT") if (! $death);
		$death->add("DATE", $value);
	    }
	    elsif ($tag eq "DP") {
		$death = $record->add("DEAT") if (! $death);
		$death->add("PLAC", $value);
	    }
	    elsif ($tag eq "BUR") {
		$burial = $record->add("BURI") if (! $burial);
		$burial->add("PLAC", $value);
	    }
	    elsif ($tag eq "P") {
		push(@{$link_child_to_family{$value}}, $record_identifier);
	    }
	    elsif ($tag eq "S") {
		$value =~ s/^\s+//;
		$value =~ s/\s+$//;
		for my $family (split(/[,\s]+/, $value)) {
		    push(@{$link_spouse_to_family{$family}}, $record_identifier);
		}
	    }
	    elsif ($tag eq "ADDR") {
		# Skip current addresses
		push(@{$link_address_to_individual{$value}}, $record_identifier);
	    }
	    elsif ($tag eq "TNOTE" or $tag eq "GEN" or $tag eq "TGEN" or $tag eq "COM") {
		&add_multiline($record, "NOTE", $value);
	    }
	    elsif ($tag eq "EDITED") {
		$record->add("CHAN")->add("DATE", $value);
	    }
	    elsif ($strict) {
		die "Bad field: $tag:$value in $_";
	    }
	    else {
		warn "Ignoring unrecognized field $tag:$value in $_";
	    }
	}
	if ($first_name || $middle_name || $last_name || $nick_name ||
	    $name_prefix || @extra_names) {
	    my($given_name, $given_name2, $name);
	    if (@extra_names && ! $last_name) {
		$last_name = shift @extra_names;
	    }
	    if ($first_name || $middle_name) {
		$given_name = join(" ", ($first_name || "???"),
				   ($middle_name || ()));
		$given_name2 = join(", ", ($first_name || "???"),
				   ($middle_name || ()));
	    }
	    $name = 
		$record->add("NAME",
			     join(" ", ($name_prefix || ()),
				  ($given_name || ()),
				  ($nick_name ? "($nick_name)" : ()),
				  ($last_name ? "/$last_name/" : ())));
	    $name->add("NPFX", $name_prefix) if ($name_prefix);
	    $name->add("GIVN", $given_name2) if ($given_name);
	    $name->add("NICK", $nick_name) if ($nick_name);
	    $name->add("SURN", $last_name) if ($last_name);
	    for my $extra (@extra_names) {
		$record->add("NAME",
			     join(" ",
				  ($given_name || ()), "/$extra/"));
	    }
	}
	$individual_records{$record_identifier} = $record;
    }
    elsif ($record_type eq "F") {
	my $record = $ged->add_family;
	my($married, $divorced);
	for my $ref (@fields) {
	    my($tag, $value) = @{$ref};
	    if ($tag eq "N") {
		# Don't know what to do with this, so skip it
		next;
	    }
	    elsif ($tag eq "H") {
		die "Bad xref to individual $value"
		    if (! $individual_records{$value});
		$record->add("HUSB", $individual_records{$value});
		if (! grep($_ == $value,
			   @{$link_spouse_to_family{$record_identifier}})) {
		    die "Family $record_identifier links husband to $value but not the reverse\n";
		}
		$individual_records{$value}->add("FAMS", $record);
	    }
	    elsif ($tag eq "W") {
		die "Bad xref to individual $value"
		    if (! $individual_records{$value});
		$record->add("WIFE", $individual_records{$value});
		if (! grep($_ == $value,
			   @{$link_spouse_to_family{$record_identifier}})) {
		    die "Family $record_identifier links wife to $value but not the reverse\n";
		}
		$individual_records{$value}->add("FAMS", $record);
	    }
	    elsif ($tag eq "M") {
		$married = $record->add("MARR") if (! $married);
		$married->add("DATE", $value);
	    }
	    elsif ($tag eq "MP") {
		$married = $record->add("MARR") if (! $married);
		$married->add("PLAC", $value);
	    }
	    elsif ($tag eq "MD") {
		$divorced = $record->add("DIV") if (! $divorced);
		$divorced->add("DATE", $value);
	    }
	    elsif ($tag eq "C") {
		$value =~ s/^\s+//;
		$value =~ s/\s+$//;
		my(@children) = split(/[,\s]+/, $value);
		my(%children);
		map($children{$_} = 1, @children);
		foreach my $child (@{$link_child_to_family{$record_identifier}}) {
		    if (! delete $children{$child}) {
			die "Individual $child points at parent $record_identifier but not the reverse\n";
		    }
		    $record->add("CHIL", $individual_records{$child});
		    $individual_records{$child}->add("FAMC", $record);
		}
		if (%children) {
		    map(warn("Family $record_identifier points at child $_ but not the reverse\n"), keys %children);
		    exit(1);
		}
	    }
	    elsif ($tag eq "ADDR") {
		push(@{$ignore_address_link{$value}}, $record_identifier);
		next;
	    }
	    elsif ($tag eq "TNOTE" or $tag eq "GEN" or $tag eq "TGEN" or $tag eq "COM") {
		&add_multiline($record, "NOTE", $value);
	    }
	    elsif ($tag eq "EDITED") {
		$record->add("CHAN")->add("DATE", $value);
	    }
	    elsif ($strict) {
		die "Bad field: $tag:$value in $_";
	    }
	    else {
		warn "Ignoring unrecognized field $tag:$value in $_";
	    }
	}
    }
    elsif ($record_type eq "A") {
	my(@who, $address, $phone);
	for my $ref (@fields) {
	    my($tag, $value) = @{$ref};
	    if ($tag eq "WHO") {
		$value =~ s/^\s+//;
		$value =~ s/\s+$//;
		@who = split(/[,\s]+/, $value);
	    }
	    elsif ($tag eq "ADDR") {
		$address = $value;
	    }
	    elsif ($tag eq "PHONE") {
		$phone = $value;
	    }
	    elsif ($tag eq "EDITED") {
		next;
	    }
	    elsif ($strict) {
		die "Bad field: $tag:$value in $_";
	    }
	    else {
		warn "Ignoring unrecognized field $tag:$value in $_";
	    }
	}
	next if (! ($address || $phone));
	my(%to_link, %to_ignore);
	map($to_link{$_} = 1, @{$link_address_to_individual{$record_identifier}});
	map($to_ignore{$_} = 1, @{$ignore_address_link{$record_identifier}});
	map {
	    if (! ($to_link{$_} or $to_ignore{$_})) {
		die "Address $record_identifier links to $_ but not the reverse\n";
	    }
	} @who;
	for my $id (@who) {
	    next if (delete $to_ignore{$id});
	    delete $to_link{$id};
	    my $resi = $individual_records{$id}->add("RESI");
	    if ($address) {
		$address =~ s/^\s+//;
		$address =~ s/\s+$//;
		my(%fields);
		if ($address =~ s/\s+USA\s*$//) {
		    $fields{"CTRY"} = "US";
		}
		if ($address =~ s/[,\s]+(Apartment\s*[^\s,]+)//) {
		    $fields{"ADR2"} = $1;
		}
		if ($address =~ /(.*),\s*([A-Z]{2})$/) {
		    $fields{"CITY"} = $1;
		    $fields{"STAE"} = $2;
		}
		else {
		    my $ref = Geo::StreetAddress::US->parse_location($address);
		    if (! $ref) {
			if ($strict_addresses) {
			    die "Couldn't parse address: $address";
			}
			else {
			    warn "Ignoring unparseable address: $address";
			}
		    }
		    else {
			$fields{"ADR1"} = &concat($ref->{number},
						  $ref->{prefix},
						  $ref->{street},
						  $ref->{type},
						  $ref->{suffix});
			$fields{"CITY"} = $ref->{city};
			$fields{"STAE"} = $ref->{state};
			$fields{"POST"} = $ref->{zip};
		    }
		}
		if (%fields) {
		    my $addr = $resi->add("ADDR");
		    map($addr->add($_, $fields{$_}), keys %fields);
		}
	    }
	    if ($phone) {
		$resi->add("PHON", $phone);
	    }
	}
	if (%to_link || %to_ignore) {
	    map {
		warn "Record $_ links to address $record_identifier but not the reverse\n";
	    } (keys %to_link, keys %to_ignore);
	    exit 1;
	}
    }
    else {
	die "Unrecognized record type T:$record_type\n$_";
    }
}

$ged->add_trailer;

my($fh, $temp);
if (! $output_file) {
    ($fh, $output_file) = tempfile() or die;
    $temp = 1;
}
else {
    open($fh, "+>", $output_file) or die;
}
$ged->order;
$ged->write($output_file);
if ($temp) {
    while (<$fh>) {
	print;
    }
    unlink($output_file);
}
close($fh);

sub concat {
    my(@values) = @_;
    @values = grep($_, @values);
    join(" ", @values);
}

sub add_multiline {
    my($record, $tag, $value) = @_;
    $value =~ s/^\s+//;
    $value =~ s/(.*)\r?\n?//;
    my $new = $record->add($tag, $1);
    while ($value) {
	$value =~ s/(.*)\r?\n?//;
	$new->add("CONT", $1);
    }
}