package XFST::Lookup;

# $Header: /var/cvsroot/XFST-Lookup/lib/XFST/Lookup.pm,v 1.11 2005/06/10 13:39:37 alex Exp $

$VERSION="0.4";

use strict;
use Carp;

=head1 NAME

XFST::Lookup - PERL extension to access *XEROX*'s lookup-utility

=head1 SYNOPSIS
    
    use XFST::Lookup;
    my $lookup = new XFST::Lookup({debug=>1);

    $lookup->set_flags("some", "flags");
   
    # to use a single FST:
    my %looked_up_wordforms = $lookup->lookup({
        fst => 'path/to_a/finite_state_transducer',
        words => \@wordforms_as_array_REFERENCE
    } );

    # to use a lookup-strategy:
    my %looked_up_wordforms = $lookup->lookup({
        strategy => 'path/to_a/lookup_strategy',
        words => \@wordforms_as_array_REFERENCE
    } );
    
=head1 DESCRIPTION

XFST::Lookup() provides an easy way to access the functionality of the
*XEROX* lookup-utility from within PERL.
I<lookup> itself "applies a pre-compiled transducer or transducers to
look up words".

=cut

sub new {
    my $class = shift;
    my $self = {};

    # handle options?
    while ( my $opt_ref = shift( @_ ) ) {
        croak 'Options not supplied via hashref'
            unless (ref($opt_ref) eq 'HASH' );

        foreach my $key ( keys %{$opt_ref} ) {
            $self->{$key} = $opt_ref->{$key};
            print "setting '$key' to '$opt_ref->{$key}'.\n" if $self->{debug};
        }
    }
    
    _check_lookup($self);
    
    return bless $self, $class;
}

sub _check4bin {
    my $self = shift;
    my $binary = shift;
    my $path2bin;
    
    print "Looking for '$binary'... " if $self->{debug};

    # try to locate xfst's lookup (or any other binary) (in $ENV{PATH})
    # TODO: what about win32? steffi?
    if ( $^O =~ /linux|bsd|darwin/ ) {
        # got a user-defined path? add to $ENV{PATH}
        $ENV{PATH} .= ":$self->{path}" if $self->{path};
        
        # give 'which' a try...
        chomp( $path2bin = `which $binary 2>/dev/null` );
        
        croak "Cannot locate '$binary' in \$PATH ('$ENV{PATH}').\n" .
            "\nYou can add a user-defined path via the 'path'-parameter.\n"
            if ( $? != 0 );
    }
    
    print "$path2bin\n" if $self->{debug};

    $self->{"$binary\_binary"} = $path2bin;
    
    return $path2bin;
}

sub _check_lookup {
    my $self = shift;
    
    # got lookup in place?
    my $bin = _check4bin($self,'lookup');

    my $version_str = `$bin -v 2>&1`;
    $version_str =~ /lookup ([\d\.]+)/;
    (defined $1) ? $self->{lookup_version} = $1 :
        carp "Cannot determine lookup-version!\n";

    # do version specific stuff here if necessary...

    print "Interfacing with *XEROX* lookup version $self->{lookup_version}.\n"
        if $self->{debug};

    return 1;
}

=head1 CONSTRUCTOR
    
You may provide parameters via an anonymous hashref like this:

    my $test = new XFST::Lookup( {
            debug => 1 ,
            flags => 'some flags',
            path => '/home/alex/bin/:/home/roni/bin/',
            fst => 'path/to_a/FST/file'
    } );

=over 2

=item I<debug> [BOOL]

turns very verbose debugging messages on

=item I<flags> [$scalar]

set lookup-flags (see set_flags())

=item I<fst> [path]

set the path to a finite state transducer _once_ this way,
you won't have to set it every time you use lookup()

=item I<strategy> [path]

set the path to the file containing your lookup-strategy _once_ this way,
you won't have to set it every time you use lookup()

=item I<path> [path]

Specify an extra-$PATH or a list of $PATHes in order to help XFST::Lookup
to locate the lookup-binary. (separator: ':')

Per default XFST::Lookup is looking for the binary in $ENV{PATH} on
Linux,*BSD and Mac OS calling the which(1)-utility that is shipped with
every *NIX-distribution.

=back

=head1 METHODS

=head2 B<set_flags()> [@array|$scalar]

=over 2

set_flags() is used to set the flags that'll be used by *XEROX*'s
lookup-utility. See [3] for a brief overview.

Please note: the 'LxL' and 'TxT'-flags are stripped, even if you set them
manually in order to do what you mean (I at least hope so) ;)

=back

=cut

sub set_flags {
    my $self = shift;
    my $flags;
    
    # any number of flags will be valid if supplied in array/scalar-context

    foreach my $flag ( @_ ) {
        print "Adding '$flag' to lookup's flags...\n" if $self->{debug};
        $flags .= ' ' . $flag;
    }

    $self->{lookup_flags} = $flags;
}

=head2 B<lookup()> [{anonymous hashref}]

=over 2

... is the most important method of XFST::Lookup (like the name may already
suggest). Every time lookup() is invoked from the inside of a PERL-program,
*XEROX*'s lookup-utility is called with the given options and all results
are returned as a I<hash of arrays>. This datastructure has been chosen
because there may be ambiguous wordforms so a simple hash won't fullfil
the needs.

=over 2

=item I<fst> [path]

set the patch to the file that contains the finite state transducer to use
('save stack'-format).

B<fst> is optional at this point iff it was already passed to the constructor,
it is forbidden if you use the I<strategy>-parameter at the same time.

Please note: Setting B<fst> again at this point will overwrite the setting for
the current run _only_. see CONSTRUCTOR

=item I<strategy> [path]

set parameter to the path to the stratgy-file if you want to use a
lookup-strategy rather than a single finite state transducer.

Conflicts with the I<fst>-parameter.

Please note: Setting B<strategy> again at this point will overwrite the
setting for the current run _only_. see CONSTRUCTOR

=item I<words> [\@array_REFERENCE]

this parameter is _obligatory_ and has to contain an I<array-reference> to
a list of wordforms that will be analyzed by *XEROX*'s lookup-utility.


=back

=cut

sub lookup { 
    my $self = shift;
    my $opt = shift || croak "No options given!\n";
    
    my %results;
    
    croak "Options not supplied via hashref!\n" unless ( ref($opt) eq 'HASH' );
    
    $opt->{fst} ||= $self->{fst};
    $opt->{strategy} ||= $self->{strategy};
    
    croak "Please set either 'fst' *or* 'strategy'!\n"
        if ( $opt->{fst} && $opt->{strategy} );
   
    croak "You have to set either the 'fst' or the 'strategy'-parameter.\n"
        unless ( $opt->{fst} || $opt->{strategy} );
    
    croak "No wordforms given!\n" unless @$opt{words};
    
    my $lookup_file;
    if ( $opt->{fst} ) {
        $lookup_file= $opt->{fst};
        print STDERR "Using a single FST ('$lookup_file').\n" if $self->{debug};
    }
    else {
        $lookup_file= "-f " . $opt->{strategy};
        print STDERR "Using a lookup-strategy ('$lookup_file').\n"
            if $self->{debug};
    }
   
    print "Using FST '$lookup_file'...\n", scalar(@{$opt->{words}}),
        " words to look up.\n" if $self->{debug};

    # any extra flags that shall be passed to *XEROX*'s lookup?
    my $flags;
    ( $self->{lookup_flags} ) ? $flags = '' :
        $flags="-flags $self->{lookup_flags}";
   
    # Strip lookup's 'LL' and 'TT'-flags (they could break _anything_
    # later on...)
    if ( $flags =~ /T[^T]*T|L[^L]*L/g ) {
        $flags =~ s/(T[^T]*T)|(L[^L]*L)//g;
        print STDERR "Stripping flags... (new: '$flags')\n" if $self->{debug};
    }

    # where shall we direct lookup's STDERR output to?
    # we don't want to parse that one anyway... (infos, stats)
    # TODO: what about win32? :(
    my $dest_stream;
    ( $self->{debug} == 1 ) ? $dest_stream = '&2' :  
        ( ( $^O =~ /linux|bsd|mac/i ) ?
            $dest_stream = '&2' : $dest_stream = '/dev/null' );
    
    # using fork() and stuff in order to get rid of File::Temp...

    my ( $pid, @output ); 
    # coming up next: fork()-magic ;-)
    if( $pid = open( CHILD, "-|")) { # -> parent
        print STDERR "Child forked. PID is $pid.\n" if $self->{debug};

        chomp( @output = <CHILD> );

        close CHILD;
    }
    else { # -> child
        croak "fork() failed: '$!'\n" unless defined $pid;
        print STDERR "Child executes $self->{lookup_binary}." .
            " Output goes to parent.\n" if $self->{debug};

        # open the pipe to lookup...
        open PIPE,
            "| $self->{lookup_binary} $lookup_file $flags 2>$dest_stream"
                or croak "Cannot fork(): $self->{lookup_binary}: '$!'\n";
        # ...and feed in the wordforms.
        print PIPE join("\n",@{$opt->{words}}), "\n";
        
        # terminate child (continuing under parent-pid)
        exit;
    }

    # the follwing class describes all chars that are valid for the
    # transducer's output.
    my $cclass = '\w\d\.-_\+!\"\'\\\//';
    
    # extract information from lookup's output
    foreach my $line ( @output ) {
        chomp $line;
        next unless $line;

        # match the three fields ( 1x input, 2x parts analysis(base+tags) )
        $line =~ /^([$cclass]+)\s+([$cclass]+)\s+([$cclass]+)$/;
        if( $1 && $2 && $3 ) {
            push @{$results{$1}}, "$2$3";
        }
        else {
            carp "Error parsing line: '$line'\n";
        }
    }
    
    return %results;
}

=head1 SEE ALSO

[1] "I<Finite State Morphology>", Kenneth R. Beesley and Lauri Karttunen,
    CSLI Publications Stanford
    
[2] http://www.fsmbook.com

[3] the output of 'lookup -h'

[4] updates of XFST::Lookup may be available at
    http://homepage.rub.de/Alexander.Linke-2 

=head1 AUTHOR

Alex Linke, E<lt>Alexander.Linke-2@ruhr-uni-bochum.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 by Alex Linke

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.5 or,
at your option, any later version of Perl 5 you may have available.

=cut

1;
