package XFST::Lookup;

# $Header: /var/cvsroot/XFST-Lookup/lib/XFST/Lookup.pm,v 1.29 2005/07/24 15:53:17 alex Exp $

$VERSION="0.11";

use strict;
use Carp;

=head1 NAME

XFST::Lookup - Perl extension to access *XEROX*'s lookup-utility

=head1 SYNOPSIS
    
    use XFST::Lookup;
    my $lookup = new XFST::Lookup({debug=>1);

    $lookup->set_flags("some", "flags");
   
    # to use a single FST:
    my %looked_up_wordforms = $lookup->lookup({
        fst => 'path/to_a/finite_state_transducer',
        words => \@wordforms_as_array_REFERENCE
    } );

    # to use a lookup-strategy:
    my %looked_up_wordforms = $lookup->lookup({
        strategy => 'path/to_a/lookup_strategy',
        words => \@wordforms_as_array_REFERENCE
    } );
    
=head1 DESCRIPTION

XFST::Lookup() provides an easy way to access the functionality of the
*XEROX* lookup-utility from within Perl.
I<lookup> itself "applies a pre-compiled transducer or transducers to
look up words".

=cut

sub new {
    my $class = shift;
    my $self = {};

    # handle options?
    while ( my $opt_ref = shift( @_ ) ) {
        croak 'Options not supplied via hashref'
            unless (ref($opt_ref) eq 'HASH' );

        foreach my $key ( keys %{$opt_ref} ) {
            $self->{$key} = $opt_ref->{$key};
            print "setting '$key' to '$opt_ref->{$key}'.\n" if $self->{debug};
        }
    }
    
    _check_lookup($self);
    
    return bless $self, $class;
}

sub _check4bin {
    my $self = shift;
    my $binary = shift;
    my $path2bin = "";
    my $found = 0;
    
    print "Looking for '$binary'... " if $self->{debug};

    # try to locate xfst's lookup (or any other binary) (in $ENV{PATH})
    if ( $^O =~ /linux|bsd|darwin|cygwin/ ) {
        # got a user-defined path? add to $ENV{PATH}
        $ENV{PATH} .= ":$self->{path}" if $self->{path};
        
        # give 'which' a try...
        chomp( $path2bin = `which $binary 2>/dev/null` );
    
        ( $? == 0 ) ? $found = 0 : $found = 1;
    }
    elsif ( $^O eq "MSWin32" ) {
        my $ms_binary="$binary\.exe";
        if( system("$ms_binary -v") == 0) {
            print "MSWin32: $ms_binary found in standard-path...\n"
                if $self->{debug};
            $found=1;
            $path2bin=$ms_binary;
        }
        elsif( $self->{path}  && system("$self->{path}\\$ms_binary -v") == 0 ) {
            print "MSWin32: $ms_binary found in user-defined path: " .
                "'$self->{path}'\n" if $self->{debug};
            $found = 1;
            $path2bin="$self->{path}\\$ms_binary";
        }
    }
    
    croak "Cannot locate '$binary' in \$PATH ('$ENV{PATH}').\n" .
        "\nYou can add a user-defined path via the 'path'-parameter.\n"
        unless( $found );
    
    print "$path2bin\n" if $self->{debug};

    $self->{"$binary\_binary"} = $path2bin;
    
    return $path2bin;
}

sub _check_lookup {
    my $self = shift;
    
    # got lookup in place?
    my $bin = _check4bin($self,'lookup');

    my $version_str = `$bin -v 2>&1`;
    $version_str =~ /lookup ([\d\.]+)/;
    
    if (defined $1) {
        $self->{lookup_version} = $1;
    }
    else {
        carp "Cannot determine lookup-version!\n";
        $self->{lookup_version} = ">unknown<";
    }

    # do version specific stuff here if necessary...

    print "Interfacing with *XEROX* lookup version $self->{lookup_version}.\n"
        if $self->{debug};

    return 1;
}

=head1 CONSTRUCTOR
    
You may provide parameters via an anonymous hashref like this:

    my $lookup = new XFST::Lookup( {
            debug => 1 ,
            flags => 'some flags',
            path => '/home/alex/bin/:/home/roni/bin/',
            fst => 'path/to_a/FST/file'
    } );

=over 2

=item I<debug> [BOOL]

turns very verbose debugging messages on

=item I<flags> [$scalar]

set lookup-flags (see set_flags())

=item I<fst> [path]

set the path to a finite state transducer _once_ this way,
you won't have to set it every time you use lookup()

=item I<strategy> [path]

set the path to the file containing your lookup-strategy _once_ this way,
you won't have to set it every time you use lookup()

=item I<path> [path]

I<Linux/*BSD/MacOS>:

=over 2

Specify a single addition or a list of additions to $PATH in order to help
XFST::Lookup to locate the lookup-binary. (separator: ':')

Per default XFST::Lookup is looking for the binary in $ENV{PATH} on
Linux,*BSD and Mac OS calling the which(1)-utility that is shipped with
every *NIX-distribution.

=back

I<Win32>:

=over 2

On MSWin32 XFST::Lookup tries to locate the binary in the defaut path or
searches a user-defined path afterwards (i.e. supplied via
"path=>d:\alex\xerox-windows\")

Manually setting the path on this OS is recommended...

=back

=head1 METHODS

=head2 B<set_flags()> [@array|$scalar]

=over 2

set_flags() is used to set the flags that will be used by *XEROX*'s
lookup-utility. See [3] for a brief overview.

Please note: even if you set them manually, the 'LxL' and 'TxT'-flags are
stripped, in order to do what you mean. XFST::Lookup needs the default
separators in order to correctly process the output of lookup.

=back

=cut

sub set_flags {
    my $self = shift;
    my $flags;
    
    # any number of flags will be valid if supplied in array/scalar-context

    foreach my $flag ( @_ ) {
        print "Adding '$flag' to lookup's flags...\n" if $self->{debug};
        $flags .= ' ' . $flag;
    }

    $self->{lookup_flags} = $flags;
}

=back

=head2 B<lookup()> [{anonymous hashref}]

=over 2

... is the most important method of XFST::Lookup (like the name may already
suggest). Every time lookup() is invoked from the inside of a Perl-program,
*XEROX*'s lookup-utility is called with the given options and all results
are returned as a I<hash of arrays>. This datastructure has been chosen
because there may be ambiguous wordforms so a simple hash won't fulfill
the needs.

Here's an example of the mentioned datastructure (Data::Dumper-format):

    $VAR1 = 'eine';
    $VAR2 = [
              'ein+ART+NOM+SG',
              'ein+ART+AKK+SG',
              'ein+PIS+AKK+SG',
              'ein+PIS+NOM+SG',
              'ein+PIAT+AKK+SG',
              'ein+PIAT+NOM+SG',
              'eine+PIDAT'
            ];
    $VAR3 = 'Haus';
    $VAR4 = [
              'Haus+NN+NOM+SG',
              'Haus+NN+DAT+SG',
              'Haus+NN+AKK+SG'
            ];
    $VAR5 = 'das';
    $VAR6 = [
              'das+ART+NOM+SG',
              'das+ART+AKK+SG',
              'das+KOUS',
              'der+PDAT+NOM+SG',
              'der+PDAT+AKK+SG',
              'der+PDS+NOM+SG',
              'der+PDS+AKK+SG'
            ];
    ...

While the hash's key contains a I<wordform>, the value provides an @array
of all analyzed I<readings> of this wordform.

The following parameters are available to alter XFST::Lookup's runtime
behaviour:

=over 2

=item I<fst> [path]

set the path to the file that contains the finite state transducer to use
('save stack'-format).

I<fst> is optional at this point iff it was already passed to the constructor,
it is forbidden if you use the I<strategy>-parameter at the same time.

Please note: Setting B<fst> again at this point will overwrite the setting for
the current run _only_. see CONSTRUCTOR

=item I<strategy> [path]

set parameter to the path to the strategy-file if you want to use a
lookup-strategy rather than a single finite state transducer.

Conflicts with the I<fst>-parameter.

Please note: Setting B<strategy> again at this point will overwrite the
setting for the current run _only_. see CONSTRUCTOR

=item I<words> [\@array_REFERENCE]

this parameter is _obligatory_ and has to contain an I<array-reference> to
a list of wordforms that will be analyzed by *XEROX*'s lookup-utility.


=back

=cut

sub lookup { 
    my $self = shift;
    my $opt = shift || croak "No options given!\n";
    
    my %results;

    my %seen;
    
    croak "Options not supplied via hashref!\n" unless ( ref($opt) eq 'HASH' );
    
    $opt->{fst} ||= $self->{fst};
    $opt->{strategy} ||= $self->{strategy};
    
    croak "Please set either 'fst' *or* 'strategy'!\n"
        if ( $opt->{fst} && $opt->{strategy} );
   
    croak "You have to set either the 'fst' or the 'strategy'-parameter.\n"
        unless ( $opt->{fst} || $opt->{strategy} );
    
    croak "No wordforms given!\n" unless @$opt{words};
    
    my $lookup_file;
    if ( $opt->{fst} ) {
        $lookup_file= $opt->{fst};
        print STDERR "Using a single FST ('$lookup_file').\n"
            if $self->{debug};
        croak "File '$lookup_file' doesn't exist!\n" unless -e $lookup_file;
    }
    else {
        croak "File '$lookup_file' doesn't exist!\n"
            unless -e $opt->{strategy};
        $lookup_file= "-f " . $opt->{strategy};
        print STDERR "Using a lookup-strategy ('$opt->{strategy}').\n"
            if $self->{debug};
    }

    print "Using FST '$lookup_file'...\n", scalar(@{$opt->{words}}),
        " words to look up.\n" if $self->{debug};

    # Any extra flags that shall be passed to *XEROX*'s lookup?
    my $flags;
    unless ( $self->{lookup_flags} ) { $flags = '' }
    else { $flags="-flags" . ' ' . $self->{lookup_flags} }
   
    # Strip lookup's 'LL' and 'TT'-flags (they could break _everything_
    # later on...)
    if ( $flags =~ /T[^T]*T|L[^L]*L/g ) {
        $flags =~ s/(T[^T]*T)|(L[^L]*L)//g;
        print STDERR "Stripping flags... (new: '$flags')\n" if $self->{debug};
    }

    # Where shall we direct lookup's STDERR output to?
    # We don't want to parse that one anyway... (infos, stats)
    # NOTE: Doesn't work on MS Windows...
    my $dest_stream;
    ( $self->{debug} == 1 ) ? $dest_stream = '&2' :  
        ( ( $^O =~ /linux|bsd|darwin/i ) ?
            $dest_stream = '&2' : $dest_stream = '/dev/null' );
    
    # Using fork() and stuff in order to get rid of File::Temp...
    # ...iff not running on MS Windows...
    
    my ( $pid, @output );

    if ( $^O eq "MSWin32" ) {
        # This is a hack - I am aware of that :(
        # I don't know how to do it any better on Win32, because I am not
        # familiar with that OS at all.

        # Here's what went wrong/didn't work/caught fire:
        #  o Fcntl (flock) doesn't work at all (see perlport-function)
        #  o wait/waitpid doesn't work as expected (doesn't seem to threat
        #    a pipe as a child-process - but it should (according to
        #    perlport-function again), because perl is able to emulate fork()
        #    although win32 generally lacks support for this syscall.
        #  o IPC::Open2 didn't work well - in fact it refused to let me read
        #    any data back from the pipe
        #    
        #    ...so File::Temp and a pseudo-cat are the last resort :(       
        
        print "Using File::Temp on Win32 :(\n" if $self->{debug};

        use File::Temp;
        
        my $tmp = new File::Temp( UNLINK => 1, SUFFIX => '.txt' );

        print "Tempfile is '$tmp'\n" if $self->{debug};

        # write all wordforms to the tempfile...
        print $tmp join("\n", @{$opt->{words}}), "\n";
        
        # ... use Perl as kind of cat-command to pipe the wordforms to
        # lookup. This looks terribly ugly, but the Win32 echo-command
        # doesn't even seem to handle escape-sequences, which are required
        # in order to get some useful results out of lookup.
        open WINPIPE,
            "perl -e \"open FH, '$tmp' or die; print foreach <FH>\"" . 
            " | $self->{lookup_binary} $lookup_file $flags |" or
            die "Failed to open pipes: '$!'\n";

        @output = <WINPIPE>;
    }   
    else {  
        # running on *NIX -- using fork()
        if( $pid = open( CHILD, "-|") ) { # -> parent
            print STDERR "Child forked. PID is $pid.\n" if $self->{debug};

            chomp( @output = <CHILD> );

            close CHILD;
        }
        else { # -> child
            croak "fork() failed: '$!'\n" unless defined $pid;
            print STDERR "Child executes $self->{lookup_binary}." .
                " Output goes to parent.\n" if $self->{debug};

            # open the pipe to lookup...
            open PIPE,
                "| $self->{lookup_binary} $lookup_file $flags 2>$dest_stream"
                    or croak "Cannot fork(): $self->{lookup_binary}: '$!'\n";
            # ...and feed in the wordforms.
            print PIPE join("\n",@{$opt->{words}}), "\n";
        
            # terminate child (continuing under parent-pid)
            exit;
        }
    }

    # extract information from lookup's output
    foreach my $line ( @output ) {
        # cygwin's lookup-output contains "\r", so we can't just chomp()
        $line =~ s/[\n\r]//g;
        next unless $line;

        # match the three fields ( 1x input, 2x parts analysis(base+tags) )
        $line =~ /^([^\s]+)\s+([^\s]+)\s+([^\s]+)$/;
        if ( $1 && $2 && $3 ) {
            unless( $seen{$1}->{"$2$3"} ) {
                push @{$results{$1}}, "$2$3";
                $seen{$1}->{"$2$3"} = 1;
            }
        }
        else {
            carp "Error parsing line: '$line'\n";
        }
    }

    return %results;
}

=head1 SEE ALSO

[1] "I<Finite State Morphology>", Kenneth R. Beesley and Lauri Karttunen,
    CSLI Publications Stanford
    
[2] http://www.fsmbook.com

[3] the output of 'lookup -h'

[4] updates of XFST::Lookup may be available at
    http://homepage.rub.de/Alexander.Linke-2 

=head1 AUTHOR

Alex Linke, E<lt>Alexander.Linke-2@ruhr-uni-bochum.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 by Alex Linke

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.5 or,
at your option, any later version of Perl 5 you may have available.

=cut

1;
