package Lingua::Tokenize;

use strict;
use Carp;

# $Header: /var/cvsroot/Lingua-Tokenize/lib/Lingua/Tokenize.pm,v 1.15 2005/07/24 15:51:11 alex Exp $
our $VERSION='0.7';

=head1 NAME

Lingua::Tokenize - Perl extension for Tokenization (and Normalization)

=head1 SYNOPSIS

  use Lingua::Tokenize;
  
  my $tokenizer = new Lingua::Tokenize( { debug => 1 } );

  my @tokens = $tokenizer->tokenize( {
          normalize => 1,
          tokendef => 'words',
          input => \$wholeCorpusInAScalar
  } );

  my @types = $tokenizer->unify( \@tokens );

  my @sentences = $tokenizer->extract_sentences( \@tokens );

=head1 DESCRIPTION

Lingua::Tokenize provides all methods necessary to tokenize and normalize
I<*german*> text from within Perl.

=cut

sub new {
    my $class = shift;
    my $self = {};

    # handle options?
    while ( my $opt_ref = shift( @_ ) ) {
        croak 'Options not supplied via hashref'
            unless (ref($opt_ref) eq 'HASH' );

        foreach my $key ( keys %{$opt_ref} ) {
            $self->{$key} = $opt_ref->{$key};
            print "setting '$key' to '$opt_ref->{$key}'.\n" if $self->{debug};
        }
    }
    
    return bless $self, $class;
}

=head1 METHODS

=head2 B<unify()> [\@arrayref]

=over 2

unify() returns all types contained in an \@arrayref of tokens -
either as an hash (providing occurence counts as values) or as an array.

as usual, the return-value is determined by the context unify() is called
from.

**NOTE: at the moment, only an @array is returned :(

=back

=cut

sub unify {
    my $self = shift;
    my $words = shift;

    croak "No words given!\n" unless ( $words );
    croak "Words not supplied via ARRAY-ref!\n"
        unless ( ref $words eq 'ARRAY' );

    my %types;
    my $tokens;
    
    foreach my $token ( @{$words} ) {
        $types{$token}++; $tokens++;
    }

    print "unify: ", scalar( keys %types ), " types from $tokens tokens.\n"
        if $self->{debug};

    # TODO: how to differ between want[hash|array]?
    # at the moment, just return an array...
    return keys %types;
}

=head2 B<tokenize()> { anonymous => hash }

=over 2

tokenize() segments a given string of input into single elements called
tokens and returns all of them in array-context.

the following options may be supplied:

=over 2

=item normalize [bool]

Shall the input be normalised before tokenization?

Please NOTE: Only german normalization-rules are implemented at the moment.

=item input  [\$scalar-ref]

Set the input that shall be tokenized. Must be given as scalar-reference.

=item tokendef ['words'|'sentences']

setting tokendef to I<'sentences'> will behave like a shortcut: all tokens
will be passed to extract_sentences() and the sentences returned instead
of the word-tokens.

=back

=cut

sub tokenize {
    my $self = shift;
    my $opt = shift;

    my @tokens;
    my $tokendef;
    
    croak "No options supplied!\n" unless $opt;
    croak "Options not given via HASH-ref!\n" unless ( ref $opt eq 'HASH' );
    croak "No input given!\n" unless $opt->{input};

    unless ( $opt->{tokendef} ) {
        carp "Var 'tokendef' has no value. Assuming 'words'.";
        $tokendef = 'words';
    }
    else { $tokendef = $opt->{tokendef} }
    
    # we'll follow a new char-based processing approach to tokenization
   
    my $content = ${$opt->{input}};
    # normalize...
    $content =~ s/[\n\r\s]+/ /g;
        
    # define character classes
    my $cl_finite   = '\!\?';          # without stop
    my $cl_quote    = '\"\'\`\';
    my $cl_chars    = 'A-Za-z';
    my $cl_nf_punct = '\;\,\:\(\)';
    my $special     = '^(?:\w\d\.\-\_)';

    my @slice = split //, $content;
    my ( $buffer, $char );
    
    # this whole approach may seem silly, but it is quite fast and the
    # results are ok, too...
    # TODO: add documentation for each rule...
    while ( @slice ) {
        $char = shift @slice;
        # --> 1. rule
        if ( $char =~ /[$cl_chars]/ ) {
            $buffer .= $char;
        }
        # --> 2. rule
        elsif( $char eq ' ' ) {
            push @tokens, $buffer if $buffer;
            $buffer = '';
        }
        # --> 3. rule 
        elsif( $char =~ /[$cl_finite\.]/ && $slice[0] =~ /[\s$cl_quote]/ ) {
            # end of sentence?
            # --> part a)
            if ( $slice[1] && ( $slice[1] =~ /[A-Z$cl_quote\s]/ && 
                        ! ($buffer =~ /^[0-9]+$/)  )
                ||
                ! $slice[1] ) {
                push @tokens, $buffer if $buffer;
                push @tokens, $char;
                $buffer = '';
            }
            # --> part b)  ( "1. July" )
            else {
                $buffer .= $char;
                push @tokens, $buffer if $buffer;
                $buffer = '';
            }
        }
        # quotation marks?
        # --> 4. rule 
        elsif ( $char =~ /[$cl_quote$cl_nf_punct]/ && 
                ! ( $buffer =~ /^[0-9]$/)) {
            push @tokens, $buffer if $buffer;
            $buffer = '';
            push @tokens, $char;
        }
        # date, time
        # --> 5. rule 
        elsif ( 
            ($char =~ /[0-9]/ && ($buffer eq '' || $buffer =~ /[0-9\.\:]/))
                ||
            ($char =~ /[\.\:]/ && ( $buffer =~ /[0-9]/ )) ) {
                $buffer .= $char;
        }
        # special char 
        # --> 6. rule 
        elsif ( $char =~ /[$special]/ && ! ( $buffer =~ /^[0-9]$/)) {
            push @tokens, $buffer;
            $buffer = '';
            push @tokens, $char;
        }
        # --> 7. rule
        else {
            $buffer .= $char;
        }
    }    

    @tokens = _normalize( \@tokens, $self->{debug}, "$cl_finite\." )
        if ( $opt->{normalize} );
    
    @tokens = extract_sentences( $self, \@tokens )
        if ( $tokendef eq 'sentences' );
    
    return @tokens;
}

sub _normalize {
    my @input = @{shift()};
    my $debug = shift;
    my $cl_final = shift;

    my %filter;
    
    print "building corpus-filter... (" if $debug;
    $filter{$_}++ foreach ( @input );
    print scalar( keys %filter), " types)\n" if $debug;
    
    print "normalizing... (", scalar(@input), " elements)\n" if $debug;
    for( my $i=-1; $i < scalar( @input );  $i++ ) {
        next unless ( $input[$i] =~ /^[$cl_final]$/ || $i == -1 );
        # look for a candidate to normalize within the next _two_ cells
        # (fixes problems with stuff like this ->
        # ==>  sagte er." Dieser wiederum )
        my $candidate; my $pos;
        if ( $input[$i+1] && $input[$i+1] =~ /^[A-Z]/ ) {
            $candidate = $input[$i+1]; $pos = $i+1;
        }
        elsif ( $input[$i+2] && $input[$i+2] =~ /^[A-Z]/ ) {
            $candidate = $input[$i+2]; $pos = $i+2;
        }
        
        next unless $candidate;
        
        print "\tnormalize '$candidate' [pos: $pos]?\t" if $debug;
        $input[$pos] = lcfirst ( $input[$pos] )
            if ( $filter{lcfirst($input[$pos])} &&
                    $filter{lcfirst($input[$pos])} >= $filter{$input[$pos]} );
        print "-> '$input[$pos]'\n" if $debug;
    }

    return @input;
}

=back

=head2 B<extract_sentences()>

=over 2

extract_sentences() extracts all sentences from a given arrayref and returns
them as an array of sentences. in order to gain appropriate results it is
important to pass an array that was generated by tokenize().

Here's an example:

=over 2
    
    my @sentences = $tokenize->extract_sentences( \@wordtokens );

=back

=cut

sub extract_sentences {
    my $self = shift;
    my $data = shift;
    
    my @sentences ;
    my $buffer;
    
    # character-class definitions:

    # every element that consists _only_ of one of these is believed to be
    # a sentence ending one...
    my $ending = '\!\?\.'; 
    
    foreach my $token ( @{$data} ) {
        $buffer .= "$token ";
        
        if ( $token =~ /^[$ending]$/ ) {
            # remove trailing whitespace
            $buffer =~ s/\s$//;
            
            push @sentences, $buffer;
            print "Sentence: '$buffer'\n" if $self->{debug};
            $buffer = '';
        }
    }

    return @sentences;
}

=head1 SEE ALSO

[1] "What is a word, What is a sentence? Problems of Tokenization",
    Gregory Grefenstette, Pasi Tapanainen

Visit "http://homepage.rub.de/Alexander.Linke-2/" for updates.

=head1 AUTHORS

Alex Linke, E<lt>Alexander.Linke-2@rub.deE<gt>

Rona Linke, E<lt>Rona.Linke@rub.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2005 by Alex and Rona Linke

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.5 or,
at your option, any later version of Perl 5 you may have available.


=cut

1;
