=head1 NAME

EPrints::Plugin::Import::RIS

Author: Stewart Brownrigg, University of Kent, 10 Feb 2012

=cut


package EPrints::Plugin::Import::RIS;

use EPrints::Plugin::Import::TextFile;
use strict;
use Data::Dumper;

our @ISA = ('EPrints::Plugin::Import::TextFile');

sub new
{
        # Constructor: builds the plugin through the parent import class and
        # then sets the properties specific to the RIS importer.
        #
        #     new( $class, %params )
        #
        # %params is passed to the parent constructor unchanged.
        # Returns the plugin object.
        my( $class, %params ) = @_;

        my $self = $class->SUPER::new( %params );

        # Display name of the plugin (typo "ReferenceManger" corrected).
        $self->{name} = 'Refworks/Reference Manager (RIS format)';
        $self->{visible} = 'all';
        # This importer produces lists of eprints.
        $self->{produce} = [ 'list/eprint' ];

        return $self;
}

sub input_fh
{
    # Read RIS text from a file handle and create one data object per record.
    #
    #     input_fh
    #     (
    #         $plugin          <import plugin object>                      (required)
    #         fh      => ...   <file handle to read the RIS text from>     (required)
    #         dataset => ...   <dataset passed on to epdata_to_dataobj>    (required)
    #     );
    #
    # Each "XX  - value" row is collected into the current record.  A row
    # without a field code is treated as the continuation of a wrapped field
    # and is added as a further value of the most recent field.  The record is
    # converted with convert_input() when the 'ER' end-of-record row is seen;
    # rows after the final 'ER' are discarded.
    #
    # Returns an EPrints::List holding the ids of the created data objects.
    my( $plugin, %opts ) = @_;

    my $fh = $opts{fh};
    my @ids;        # ids of the data objects created so far
    my %record;     # RIS field code => array ref of values, for the current record
    my $lastkey;    # field code of the most recent field (target for wrapped rows)

    # Read row by row rather than slurping the whole file into memory.
    while ( defined( my $row = <$fh> ) )
    {
        # Remove the BOM.
        # NOTE(review): this only matches a decoded U+FEFF character - confirm
        # that the handle is read through a decoding layer.
        $row =~ tr/\x{feff}//d;

        # Split into field code and data on the first '  - ' only, so that data
        # which itself contains '  - ' is kept intact.
        my ( $key, $value ) = split( /\s\s-\s/, $row, 2 );
        next unless defined $key;   # completely empty row

        if ( $key eq 'ER' )
        {
            # End of Record marker 'ER  - ': the RIS record is complete, so
            # convert it and create the eprint.
            my $epdata = $plugin->convert_input( \%record );
            %record = ();           # clear for the next publication
            $lastkey = undef;
            next unless defined $epdata;

            my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
            if( defined $dataobj )
            {
                push @ids, $dataobj->get_id;
            }
        }
        elsif ( $key =~ /\A[0-9a-z]{2}\z/i && defined $value )
        {
            # A valid field has a two character code and a value.  All values
            # are stored in arrays because a field can occur more than once.
            $value =~ s/[\r\n]+\z//;    # remove trailing eol characters
            push @{ $record{$key} }, $value;
            $lastkey = $key;
        }
        else
        {
            # Multi line (wrapped) field.  This must be a string test: a
            # numeric comparison treats ordinary text as 0 and drops the row.
            ( my $text = $key ) =~ s/[\r\n]+\z//;
            if ( $text =~ /\S/ && defined $lastkey )
            {
                push @{ $record{$lastkey} }, $text;
            }
        }
    }

    return EPrints::List->new(
                    dataset => $opts{dataset},
                    session => $plugin->{session},
                    ids => \@ids );
}

sub convert_input
{
    # Convert one parsed RIS record into an EPrints epdata hash.
    #
    #     convert_input
    #     (
    #         $plugin    <import plugin object (not used by the conversion)>
    #         $entry     hash ref  <RIS field code => array ref of values>  (required)
    #     );
    #
    # Fields are deleted from $entry as they are consumed; whatever is left at
    # the end is appended to the note field by _process_unmapped().
    #
    # Returns the epdata hash ref, or nothing (undef in scalar context) when
    # the record has no TY field, so that the caller skips the record.
    my ( $plugin, $entry ) = @_;

    # RIS requires a reference type.  Without this guard the helpers below
    # would write into a private copy of an undefined $epdata and the record
    # would come back either empty or holding page data only.
    return unless defined $entry->{TY} && @{ $entry->{TY} };

    my $epdata   = {};
    my $unmapped = [];

    my $eptypes = { # hash table for mapping RIS types to EPrint types
        'ABST'   => 'article',
        'EJOUR'  => 'article',
        'JFULL'  => 'article',
        'JOUR'   => 'article',
        'MAG'    => 'article',
        'MGZN'   => 'article',
        'NEWS'   => 'article',
        'BOOK'   => 'book',
        'EBOOK'  => 'book',
        'EDBOOK' => 'edbook',
        'CONF'   => 'conference_item',
        'CPAPER' => 'conference_paper',
        'CHAP'   => 'book_section',
        'DATA'   => 'dataset',
        'AGGR'   => 'dataset',
        'DBASE'  => 'dataset',
        'RPRT'   => 'research_report',
        'PAT'    => 'patent',
        'THES'   => 'thesis',
        'SLIDE'  => 'image',
        'MUSIC'  => 'audio',
        'SOUND'  => 'audio',
        'VIDEO'  => 'video',
        'BLOG'   => 'internet',
        'MULTI'  => 'internet',
        'ELEC'   => 'internet',
        'PAMP'   => 'monograph',
        'INPR'   => 'other',
        'UNPB'   => 'other',
        'GEN'    => 'other'
    };

    foreach my $ris_type ( @{ $entry->{TY} } )
    {
        # Types without a mapping are handled as the generic type.
        # NOTE(review): because of this, the type lists below can only ever
        # match types that appear in $eptypes (or REVIEW); entries such as
        # BILL, HEAR, CASE or MPCT are currently unreachable.
        my $type = defined $eptypes->{$ris_type} ? $ris_type : 'GEN';

        # Exact membership test.  An unanchored /$type/ match would also
        # accept a type that is merely a substring of a list entry.
        my $is_type = sub { return scalar grep { $_ eq $type } @_; };

        $epdata->{type} = $eptypes->{$type};  # map RIS types to EPrint types

        # Process reviews first
        if ( defined $entry->{RI} && defined $entry->{C4} && !$is_type->('MPCT','GRANT') )
        {
            $epdata->{type} = 'review';
            $type = 'REVIEW';
        }

        #
        # Process other type-dependent fields
        # Add exceptions to the norm here
        # General catch-all rules are declared outside this loop
        #

        # Process authors, creators, editors, etc. Mapping varies according to type
        _process_names($epdata, $entry, $type);

        # Date type/publication status - published, submitted or completion (unused)
        if ( $is_type->('UNPB','INPR','THES','RPRT') )
        {
            $epdata->{date_type} = 'submitted';

            if ( $type eq 'INPR' )
            {
                $epdata->{ispublished} = 'inpress';
            }
            elsif ( $type eq 'UNPB' )
            {
                $epdata->{ispublished} = 'unpub';
            }
            else
            {
                $epdata->{ispublished} = 'submitted';
            }
        }
        else
        {
            $epdata->{date_type} = 'published';
            $epdata->{ispublished} = 'pub';
        }

        # CY - Place of publication / Location of conference (event)
        if ( $is_type->('CPAPER','CONF') )
        {
            _join_field_data($epdata, $entry, 'CY', 'event_location', ', ');
        }

        # IS - Number of volumes CHAP otherwise Issue
        if ( $is_type->('CHAP') )
        {
            _join_field_data($epdata, $entry, 'IS', 'num_pieces', ', ');
        }

        # SN - ISSN
        if ( $is_type->('ABST','INPR','JFULL','JOUR','AGGR','DATA','EJOUR','MGZN','MUSIC','NEWS') )
        {
            _join_field_data($epdata, $entry, 'SN', 'issn', '; ');
        }

        # VL - Volume / Other
        if ( $is_type->('BLOG','THES') )
        {
            # A reason is always supplied so that the note never shows an
            # empty pair of brackets.
            _store_unmapped($epdata, $entry, 'VL', $unmapped, 'Field not mapped to EPrints' );
        }
        elsif ( $is_type->('CHART','EQUA','FIGURE') )
        {
            _join_field_data($epdata, $entry, 'VL', 'size', ', ');
            _store_unmapped($epdata, $entry, 'A2', $unmapped, 'Field not mapped to EPrints' );
        }

        # T2 - Book/Volume title / Series title / Publication / Conference name
        if ( $is_type->('CPAPER','CONF','HEAR','UNPB') )
        {
            _push_array_field_data($epdata, $entry, 'T2', 'event_title');
        }
        elsif ( $is_type->('CHAP','ECHAP','ENCYC','EQUA','FIGURE','MUSIC') )
        {
            _join_field_data($epdata, $entry, 'T2', 'book_title', ', ');
        }
        elsif ( $is_type->('BOOK','CTLG','CLSWK','COMP','MPCT','MAP','UNPB','ELEC') )
        {
            _join_field_data($epdata, $entry, 'T2', 'series', ', ');
        }

        # THES - type specific
        if ( $type eq 'THES' )
        {
            _join_field_data($epdata, $entry, 'T2', 'department');
            _join_field_data($epdata, $entry, 'PB', 'institution');
        }

        # T3 - Tertiary title / Series title / Corporation
        if ( $is_type->('BILL','BLOG','HEAR','UNPB') )
        {
            _push_array_field_data($epdata, $entry, 'T3', 'corp_creators');
        }
    }
    delete $entry->{TY};

    # The rest: not type dependent, or left over after picking out specific cases above

    # Title
    _join_multiple_field_data($epdata, $entry, ['T1','TI'], 'title');

    # Publication title
    _join_multiple_field_data($epdata, $entry, ['T2', 'JF'], 'publication', ', ');

    # Series title
    _join_field_data($epdata, $entry, 'T3', 'series', ', ');

    # Abstract
    _join_multiple_field_data($epdata, $entry, ['AB','N2'], 'abstract');

    # Caption
    _join_field_data($epdata, $entry, 'CA', 'commentary', ', ');

    # source repository id of bib information
    _join_field_data($epdata, $entry, 'ID', 'original_repository_id');

    # NV - Number of volumes
    _join_field_data($epdata, $entry, 'NV', 'num_pieces');

    # SO - source repository of bib information
    _join_field_data($epdata, $entry, 'SO', 'original_repository');

    # keywords
    _join_field_data($epdata, $entry, 'KW', 'keywords', ', ');

    # CY - Place of publication
    _join_field_data($epdata, $entry, 'CY', 'place_of_pub', ', ');

    # DOI / NIHMSID / CFDA / PMCID
    _return_first_value($epdata, $entry, ['DO','C7','C6'], 'id_number', $unmapped);

    # SN - ISBN (ISSN are caught earlier)
    _join_field_data($epdata, $entry, 'SN', 'isbn', '; ');

    # UR - URL
    _process_urls($epdata, $entry);

    # PB - Publisher
    _join_field_data($epdata, $entry, 'PB', 'publisher', ', ');

    # M1/IS - Issue number
    _join_multiple_field_data($epdata, $entry, ['IS','M1'], 'number', ', ');

    # VL - Volume numbering
    _join_field_data($epdata, $entry, 'VL', 'volume', ', ');

    # RI - Reviewed item
    _join_field_data($epdata, $entry, 'RI', 'reviewed_item', ', ');

    # SP/EP - Pages & pagerange. Pages can be in format:
    #    SP  - [start page]-[end page]
    # or SP  - [start page];  EP  - [end page]
    my $sp = defined $entry->{SP} ? join('', @{$entry->{SP}}) : undef;
    my $ep = defined $entry->{EP} ? join('', @{$entry->{EP}}) : undef;
    if ( defined $sp && $sp =~ /^([0-9]*)-([0-9]*)$/ )
    {
        my ( $start_page, $end_page ) = ( $1, $2 );
        $epdata->{pagerange} = $sp;
        # Only count pages when both ends are present and in order; an
        # abbreviated end page such as "123-8" would give a negative count.
        if ( $start_page ne '' && $end_page ne '' && $end_page >= $start_page )
        {
            $epdata->{pages} = ($end_page - $start_page) + 1;
        }
    }
    elsif ( defined $sp && defined $ep )
    {
        $epdata->{pagerange} = "$sp-$ep";
        # Page labels such as "e123" cannot be used in arithmetic.
        if ( $sp =~ /^[0-9]+$/ && $ep =~ /^[0-9]+$/ && $ep >= $sp )
        {
            $epdata->{pages} = ($ep - $sp) + 1;
        }
    }
    elsif ( defined $sp && $sp =~ /^[0-9]+$/ )
    {
        $epdata->{pages} = int $sp;
    }
    else
    {
        # An end page without a start page, or a single non-numeric page
        # label: keep the raw values in the notes rather than inventing a
        # range or a page count.  Both calls do nothing for an absent field.
        _store_unmapped($epdata, $entry, 'SP', $unmapped, 'Page information could not be interpreted');
        _store_unmapped($epdata, $entry, 'EP', $unmapped, 'Page information could not be interpreted');
    }
    delete $entry->{SP};
    delete $entry->{EP};

    # Date of publication - Take the first 4 digit match
    _process_dates($epdata, $entry, ['PY','Y1','Y2','DA'], $unmapped);

    # N1 - Notes
    _join_field_data($epdata, $entry, 'N1', 'note');

    # Process any leftovers and add $unmapped fields to the notes field
    _process_unmapped($epdata, $entry, $unmapped);

    return $epdata;
}


sub _store_unmapped
{
#    Move every value of one RIS field into the list of unmapped data and
#    remove the field from the RIS record.
#
#     _store_unmapped
#     (
#         $epdata    hash ref         <eprint data (not used here)>
#         $entry     hash ref         <RIS data>                        (required)
#         $risfield  string           <RIS field name to move>          (required)
#         $unmapped  array ref        <array collecting unused values>  (required)
#         $reason    string           <why the value was not used>      (may be omitted)
#     );
#
#    The heading line 'Unmapped bibliographic data:' is added before the
#    first value that is ever stored in $unmapped.

    my ( $epdata, $entry, $risfield, $unmapped, $reason ) = @_;

    # Take the values out of the record first; nothing to do for a field the
    # record does not have (and no empty entry is created for it).
    my $values = delete $entry->{$risfield};
    return unless defined $values;

    foreach my $field_value ( @{$values} )
    {
        if ( @{$unmapped} == 0 )
        {
            push @{$unmapped}, 'Unmapped bibliographic data:';
        }

        my $line = "$risfield  - $field_value";
        # Only show the brackets when there is a reason to put in them.
        $line .= " [$reason]" if defined $reason && $reason ne '';
        push @{$unmapped}, $line;
    }

    return;
}

sub _process_unmapped
{
#    Move every RIS field that is still left in the record into the list of
#    unmapped data, then append that list to the eprint's note field.
#
#     _process_unmapped
#     (
#         $epdata    hash ref         <eprint data>                     (required)
#         $entry     hash ref         <RIS data>                        (required)
#         $unmapped  array ref        <array of unused values>          (required)
#     );

    my ( $epdata, $entry, $unmapped ) = @_;

    # Sorted so that the order of the lines in the note does not depend on
    # the hash order.
    foreach my $risfield ( sort keys %{$entry} )
    {
        next unless $risfield =~ /[a-z0-9]/i;
        next unless defined $entry->{$risfield} && @{ $entry->{$risfield} };

        # One call stores (and removes) every value of the field.
        _store_unmapped( $epdata, $entry, $risfield, $unmapped, 'Field not mapped to EPrints' );
    }

    if ( @{$unmapped} > 0 )
    {
        my $string = join("\r\n", @{$unmapped});

        # Written as if/else on purpose: an unparenthesised
        # "cond ? \$x .= ... : \$x = ..." assigns to the result of the whole
        # conditional and so replaces an existing note instead of extending it.
        if ( defined $epdata->{note} )
        {
            $epdata->{note} .= "\r\n" . $string;
        }
        else
        {
            $epdata->{note} = $string;
        }
    }

    return;
}


sub _process_urls
{
#    Distribute the UR values of a record: the first URL becomes the
#    official_url (unless the eprint already has one), every further URL is
#    added to related_url as { url => ... }.  The UR field is removed from
#    the RIS record afterwards.
#
#     _process_urls(
#         hash ref <eprint data>, (required)
#         hash ref <RIS data>, (required)
#     );

    my ( $epdata, $entry ) = @_;

    for my $link ( @{ $entry->{UR} } )
    {
        # Claim the official slot while it is still free.
        unless ( defined $epdata->{official_url} )
        {
            $epdata->{official_url} = $link;
            next;
        }

        push @{ $epdata->{related_url} }, { url => $link };
    }

    delete $entry->{UR};
}

sub _process_dates
{
#    Fill the eprint date from the first usable value found in a prioritised
#    list of RIS date fields.  RIS dates have the form YYYY/MM/DD/other text;
#    the result is stored as YYYY, YYYY-MM or YYYY-MM-DD.
#
#     _process_dates
#     (
#         $epdata    hash ref         <eprint data>                     (required)
#         $entry     hash ref         <RIS data>                        (required)
#         $risfields array ref        <RIS date fields, in priority order> (required)
#         $unmapped  array ref        <array to store unused values>    (required)
#     );
#
#    Every listed field is removed from the RIS record.  A value is passed to
#    _store_unmapped() when the date is already set, when it cannot be read as
#    a date, or when it carries free text after the day.

    my ( $epdata, $entry, $risfields, $unmapped ) = @_;

    foreach my $risfield ( @{$risfields} )
    {
        my $date_strings = delete $entry->{$risfield};
        next unless defined $date_strings;

        foreach my $date_string ( @{$date_strings} )
        {
            my $reason;

            # The guard has to look at the field that is actually written
            # (date); otherwise every later value overwrites the first one.
            if ( defined $epdata->{date} )
            {
                $reason = 'EPrints field already has value set';
            }
            else
            {
                # A limit of 4 keeps all of the free text in $other.
                my ( $year, $month, $day, $other ) = split( m{/}, $date_string, 4 );
                foreach my $part ( $year, $month, $day )
                {
                    next unless defined $part;
                    $part =~ s/\A\s+//;
                    $part =~ s/\s+\z//;
                }

                if ( defined $year && $year =~ /\A[0-9]{4}\z/ )
                {
                    my $date = $year;

                    # A month is only meaningful after a valid year, and a
                    # day only after a valid month.
                    if ( defined $month && $month =~ /\A[0-9]{1,2}\z/ && $month >= 1 && $month <= 12 )
                    {
                        $date .= sprintf( '-%02d', $month );
                        if ( defined $day && $day =~ /\A[0-9]{1,2}\z/ && $day >= 1 && $day <= 31 )
                        {
                            $date .= sprintf( '-%02d', $day );
                        }
                    }
                    $epdata->{date} = $date;

                    # "2012///" has an empty fourth part: that is not extra
                    # information and must not produce a note.
                    if ( defined $other && $other =~ /\S/ )
                    {
                        $reason = 'Date contains additional text';
                    }
                }
                else
                {
                    $reason = 'Unrecognised date format';
                }
            }

            next unless defined $reason;

            # Hand over just this value, wrapped as a one-field record.
            _store_unmapped( $epdata, { $risfield => [ $date_string ] }, $risfield, $unmapped, $reason );
        }
    }

    return;
}

sub _return_first_value
{
#    Walk a prioritised list of RIS fields and copy the first value of the
#    first field that is present into a single EPrint field.  Once the EPrint
#    field has a value, every remaining listed field is handed to
#    _store_unmapped() instead (which is also called, harmlessly, for listed
#    fields the record does not have).
#
#     _return_first_value
#     (
#         $epdata    hash ref         <eprint data>                     (required)
#         $entry     hash ref         <RIS data>                        (required)
#         $risfields array ref        <RIS field names to parse>        (required)
#         $epfield   string           <destination EPrint fieldname>    (required)
#         $unmapped  array ref        <array to store unused values>    (required)
#     );
    my ( $epdata, $entry, $risfields, $epfield, $unmapped ) = @_;

    for my $tag ( @{$risfields} )
    {
        my $can_use = defined $entry->{$tag} && !defined $epdata->{$epfield};

        unless ( $can_use )
        {
            &_store_unmapped( $epdata, $entry, $tag, $unmapped, 'EPrints field already has value set' );
            next;
        }

        # Only the first value of the field is taken.
        $epdata->{$epfield} = $entry->{$tag}[0];
        delete $entry->{$tag};
    }
}

sub _join_multiple_field_data
{
#    Combine several RIS fields into one EPrint field.  The values of each
#    field are joined with the separator, and the per-field results are then
#    joined with the same separator, in the order the fields are listed.
#    Fields that are used are removed from the RIS record; the EPrint field
#    is left alone when none of the listed fields is present.
#
#     _join_multiple_field_data
#     (
#         hash ref <eprint data>, (required)
#         hash ref <RIS data>, (required)
#         array ref <RIS fields to parse>, (required)
#         string <destination EPrint field>, (required)
#         string <separator> (may be omitted)
#     );

    my ( $epdata, $entry, $risfields, $epfield, $separator ) = @_;

    my @chunks;
    for my $tag ( @{$risfields} )
    {
        my $values = $entry->{$tag};
        next unless defined $values;

        push @chunks, join( $separator, @{$values} );
        delete $entry->{$tag};
    }

    if ( @chunks > 0 )
    {
        $epdata->{$epfield} = join( $separator, @chunks );
    }
}

sub _join_field_data
{
#    Copy one RIS field into one EPrint field, joining its values with the
#    separator when the field occurred more than once.  The RIS field is
#    removed from the record once copied; nothing happens when the record
#    does not have the field.
#
#     _join_field_data
#     (
#         hash ref <eprint data>, (required)
#         hash ref <RIS data>, (required)
#         string <RIS field to parse>, (required)
#         string <destination EPrint field>, (required)
#         string <separator> (may be omitted)
#     );

    my ( $epdata, $entry, $risfield, $epfield, $separator ) = @_;

    my $values = $entry->{$risfield};
    return unless defined $values;

    $epdata->{$epfield} = join( $separator, @{$values} );
    delete $entry->{$risfield};
}

sub _push_array_field_data
{
#    Append every value of one RIS field to a multiple-value EPrint field and
#    remove the RIS field from the record.
#
#     _push_array_field_data
#     (
#         hash ref <eprint data>, (required)
#         hash ref <RIS data>, (required)
#         string <RIS field to parse>, (required)
#         string <destination EPrint field> (required)
#     );
#
#    A field the record does not have is left absent.  Looping over it
#    directly would create an empty array in the record, and the later
#    "is this field defined" tests would then store an empty string for it.

    my ( $epdata, $entry, $risfield, $epfield ) = @_;

    my $values = delete $entry->{$risfield};
    return unless defined $values && @{$values};

    push @{ $epdata->{$epfield} }, @{$values};

    return;
}

sub _process_names
{
#     Names get converted differently depending on the publication type:
#     this chooses, per type, which RIS name fields go to which EPrint field
#     and contributor role, and passes each group to _names().
#
#     _process_names
#     (
#         hash ref <eprint data>, (required)
#         hash ref <RIS data>, (required)
#         string <document type> (required)
#     );

    my ( $epdata, $entry, $type ) = @_;

    $type = '' unless defined $type;

    # Exact comparison.  Interpolating $type into a pattern would match
    # substrings of the listed types and die on regex metacharacters.
    my $type_is = sub { return scalar grep { $_ eq $type } @_; };

    # Primary authors - catch reviewers first
    if ( $type_is->('REVIEW') )
    {
        _names($epdata, $entry, ['C5'], 'creators');
        _names($epdata, $entry, ['AU','A1'], 'ri_creator');
    }
    else
    {
        _names($epdata, $entry, ['AU','A1'], 'creators');
    }

    # secondary/tertiary authors
    if ( $type_is->('BILL','CONF') )
    {
        _names($epdata, $entry, ['A2'], 'contributors', 'sponsor');
    }
    elsif ( $type_is->('ADVS','SLIDE','SOUND','VIDEO') )
    {
        _names($epdata, $entry, ['A2'], 'contributors', 'performer');
    }
    elsif ( $type_is->('BLOG') )
    {
        _names($epdata, $entry, ['A3'], 'contributors', 'illustrator');
    }
    elsif ( $type_is->('CASE') )
    {
        _names($epdata, $entry, ['A2'], 'contributors', 'reporter');
        _names($epdata, $entry, ['A3','A4'], 'contributors', 'other');
    }
    elsif ( $type_is->('THES') )
    {
        _names($epdata, $entry, ['A3'], 'contributors', 'other');
    }
    elsif ( $type_is->('DATA','MUSIC') )
    {
        _names($epdata, $entry, ['A2'], 'contributors', 'producer');
    }
    elsif ( $type_is->('MPCT') )
    {
        _names($epdata, $entry, ['A2'], 'contributors', 'director');
        _names($epdata, $entry, ['A3'], 'contributors', 'producer');
        _names($epdata, $entry, ['A4'], 'contributors', 'performer');
    }
    elsif ( $type_is->('PCOMM','ICOMM') )
    {
        # role name spelling corrected (was 'receipient')
        _names($epdata, $entry, ['A2'], 'contributors', 'recipient');
    }
    else
    {
        _names($epdata, $entry, ['A2','A3'], 'editors');
        _names($epdata, $entry, ['A4'], 'contributors', 'translator');
    }

    return;
}

sub _names
{
#     Convert the values of one or more RIS name fields into EPrint name
#     entries and append them to an EPrint field.  The RIS fields are removed
#     from the record.
#
#     _names(
#         hash ref <eprint data>, (required)
#         hash ref <RIS data>, (required)
#         array ref <name fields to parse>, (required)
#         string <destination EPrint field>, (required)
#         string <contributor type> (may be omitted)
#     );
#
#     A value of the form "family, given, lineage" becomes
#     { name => { family, given, lineage } }; for the contributors field a
#     type is added when the contributor type has a known URI.  A value
#     without a comma is treated as a corporate body and appended to
#     corp_creators instead.

    my ( $epdata, $entry, $risfields, $epfield, $contributor_type ) = @_;

    # list of contributor types taken from the EPrints contributor_type namedset
    # Keys are lower case; the lookup below lower-cases the requested type.
    my $contributor_types =
    {
        'translator' => 'http://www.loc.gov/loc.terms/relators/TRL',
        'performer' => 'http://www.loc.gov/loc.terms/relators/PRF',
        'reporter' => 'http://www.loc.gov/loc.terms/relators/RPT',
        'sponsor' => 'http://www.loc.gov/loc.terms/relators/SPN',
        'other' => 'http://www.loc.gov/loc.terms/relators/OTH',
        'producer' => 'http://www.loc.gov/loc.terms/relators/PRO',
        'director' => 'http://www.loc.gov/loc.terms/relators/DRT',
        'recipient' => 'http://www.loc.gov/loc.terms/relators/RCP',
        # misspelling passed by _process_names for PCOMM/ICOMM
        'receipient' => 'http://www.loc.gov/loc.terms/relators/RCP',
    };

    # URI to attach to each name.  The destination field is called
    # 'contributors'; the singular form is accepted as well.
    my $type_uri;
    if ( defined $contributor_type && $epfield =~ /\Acontributors?\z/ )
    {
        $type_uri = $contributor_types->{ lc $contributor_type };
    }

    my @names = ();

    foreach my $risfield ( @{$risfields} )
    {
        my $values = delete $entry->{$risfield};
        next unless defined $values;

        foreach my $risstring ( @{$values} )
        {
            # An empty field value is not a name.
            next unless defined $risstring && $risstring =~ /\S/;

            if ( $risstring !~ m/,/ )
            {
                # Corporate bodies can be authors.  This is a crude test: if no comma then assume is
                # corporation (not accurate as could match creators with a single name, or
                # ignore corporations with comma in their name).  This is best guess.
                ( my $corporate = $risstring ) =~ s/\A\s+//;
                $corporate =~ s/\s+\z//;
                push @{$epdata->{corp_creators}}, $corporate;
            }
            else
            {
                # "Smith, John" must give the given name "John", not " John".
                my @parts = split( /,/, $risstring );
                foreach my $part ( @parts )
                {
                    $part =~ s/\A\s+//;
                    $part =~ s/\s+\z//;
                }
                next unless grep { length } @parts;

                my ( $family, $given, $lineage ) = @parts;
                my $name = {};
                $name->{name} = { family => $family, given => $given, lineage => $lineage };
                if ( defined $type_uri )
                {
                    $name->{type} = $type_uri;
                }
                push @names, $name;
            }
        }
    }

    if ( @names > 0 )
    {
        push @{$epdata->{$epfield}}, @names;
    }

    return;
}


1;

=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2000-2011 University of Southampton.

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints L<http://www.eprints.org/>.

EPrints is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

EPrints is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints.  If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END