=head1 NAME

EPrints::Plugin::Import::RIS

Author: Stewart Brownrigg, University of Kent, 10 Feb 2012

=head1 DESCRIPTION

Import plugin that reads bibliographic records in RIS format (as exported by
RefWorks / Reference Manager) and converts each record into EPrints epdata.

A RIS record is a sequence of lines of the form C<XX  - value>, where C<XX>
is a two character field code, terminated by an C<ER  - > line.

=cut

package EPrints::Plugin::Import::RIS;

use EPrints::Plugin::Import::TextFile;

use strict;
use warnings;
use Data::Dumper;

our @ISA = ('EPrints::Plugin::Import::TextFile');

# RIS reference type (TY) => EPrints item type.
my %EPTYPE_OF = (
    'ABST'   => 'article',
    'EJOUR'  => 'article',
    'JFULL'  => 'article',
    'JOUR'   => 'article',
    'MAG'    => 'article',
    'MGZN'   => 'article',
    'NEWS'   => 'article',
    'BOOK'   => 'book',
    'EBOOK'  => 'book',
    'EDBOOK' => 'edbook',
    'CONF'   => 'conference_item',
    'CPAPER' => 'conference_paper',
    'CHAP'   => 'book_section',
    'DATA'   => 'dataset',
    'AGGR'   => 'dataset',
    'DBASE'  => 'dataset',
    'RPRT'   => 'research_report',
    'PAT'    => 'patent',
    'THES'   => 'thesis',
    'SLIDE'  => 'image',
    'MUSIC'  => 'audio',
    'SOUND'  => 'audio',
    'VIDEO'  => 'video',
    'BLOG'   => 'internet',
    'MULTI'  => 'internet',
    'ELEC'   => 'internet',
    'PAMP'   => 'monograph',
    'INPR'   => 'other',
    'UNPB'   => 'other',
    'GEN'    => 'other',
);

# Contributor role (as passed by _process_names) => relator URI.
# Keys are lowercase because every caller passes a lowercase role name.
my %RELATOR_OF = (
    'translator'  => 'http://www.loc.gov/loc.terms/relators/TRL',
    'performer'   => 'http://www.loc.gov/loc.terms/relators/PRF',
    'reporter'    => 'http://www.loc.gov/loc.terms/relators/RPT',
    'sponsor'     => 'http://www.loc.gov/loc.terms/relators/SPN',
    'other'       => 'http://www.loc.gov/loc.terms/relators/OTH',
    'producer'    => 'http://www.loc.gov/loc.terms/relators/PRO',
    'director'    => 'http://www.loc.gov/loc.terms/relators/DRT',
    'recipient'   => 'http://www.loc.gov/loc.terms/relators/RCP',
    'illustrator' => 'http://www.loc.gov/loc.terms/relators/ILL',
);

# Reason recorded against an unmapped value when the caller gives none.
my $DEFAULT_UNMAPPED_REASON = 'Field not mapped to EPrints';

# Constructor: registers the plugin name, visibility and what it produces.
sub new
{
    my( $class, %params ) = @_;

    my $self = $class->SUPER::new( %params );

    $self->{name}    = 'Refworks/ReferenceManger (RIS format)';
    $self->{visible} = 'all';
    $self->{produce} = [ 'list/eprint' ];

    return $self;
}

# Read RIS records from $opts{fh}, convert each one to epdata and turn it
# into a data object in $opts{dataset}.
#
# Returns an EPrints::List of the ids of the objects that were created.
sub input_fh
{
    my( $plugin, %opts ) = @_;

    my @ids;

    foreach my $record ( _read_ris_records( $opts{fh} ) )
    {
        # Convert the RIS record to an EPrint record
        my $epdata = $plugin->convert_input( $record );
        next unless defined $epdata;

        my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
        push @ids, $dataobj->get_id if defined $dataobj;
    }

    return EPrints::List->new(
        dataset => $opts{dataset},
        session => $plugin->{session},
        ids     => \@ids );
}

# Parse a filehandle into a list of RIS records.
#
# _read_ris_records
# (
#     $fh    file handle    (required)
# );
#
# Each record is a hash ref of field code => array ref of values (a field
# may be repeated). A record ends at an 'ER' line; a trailing record that
# is not closed by 'ER' before end of file is still returned.
sub _read_ris_records
{
    my ( $fh ) = @_;

    my @records;
    my %record;
    my $lastkey;
    my $first_row = 1;

    while ( defined( my $row = <$fh> ) )
    {
        if ( $first_row )
        {
            # a byte order mark that was not decoded shows up as these bytes
            $row =~ s/\A\xEF\xBB\xBF//;
            $first_row = 0;
        }
        $row =~ tr/\x{feff}//d;     # remove a decoded BOM
        $row =~ s/[\r\n]+\z//;      # remove trailing eol characters

        # Split on the FIRST '  - ' only, so a value may itself contain the
        # separator. The value part is optional so that 'ER  -' with no
        # trailing space is still recognised.
        my ( $key, $value );
        if ( $row =~ /\A(.*?)\s\s-(?:\s(.*)|\z)/s )
        {
            ( $key, $value ) = ( $1, defined $2 ? $2 : '' );
        }

        if ( defined $key && $key eq 'ER' )
        {
            # End of Record marker: we have a complete RIS record
            if ( %record )
            {
                my %complete = %record;
                push @records, \%complete;
            }
            %record  = ();
            $lastkey = undef;
        }
        elsif ( defined $key && $key =~ /\A[0-9a-z]{2}\z/i )
        {
            # a valid field has a two character key. All values are added
            # to arrays in case there is more than one value.
            push @{ $record{$key} }, $value;
            $lastkey = $key;
        }
        elsif ( defined $lastkey && $row =~ /\S/ )
        {
            # multi line (wrapped) field: attach to the preceding field
            my $continuation = $row;
            $continuation =~ s/\A\s+//;
            $continuation =~ s/\s+\z//;
            push @{ $record{$lastkey} }, $continuation;
        }
    }

    if ( %record )
    {
        my %complete = %record;
        push @records, \%complete;
    }

    return @records;
}

# Return true if $type is exactly equal to one of @candidates.
sub _type_in
{
    my ( $type, @candidates ) = @_;

    return scalar grep { $_ eq $type } @candidates;
}

# Convert one RIS record into EPrints epdata.
#
# convert_input
# (
#     $plugin    plugin object (not used by the conversion itself)
#     $entry     hash ref of RIS field code => array ref of values (required)
# );
#
# Fields are removed from $entry as they are consumed; whatever is left at
# the end is appended to the note field. Returns the epdata hash ref, or
# undef when the record has no TY (reference type) field.
sub convert_input
{
    my ( $plugin, $entry ) = @_;

    my @ris_types = defined $entry->{TY} ? @{ $entry->{TY} } : ();
    delete $entry->{TY};
    return unless @ris_types;

    my $epdata   = {};
    my $unmapped = [];

    foreach my $ris_type ( @ris_types )
    {
        my $type = defined $ris_type ? uc $ris_type : '';
        $type =~ s/\A\s+//;
        $type =~ s/\s+\z//;

        # NOTE(review): types that are not in %EPTYPE_OF are treated as GEN
        # before the type-dependent rules run, so rules below that name
        # types missing from the table (e.g. HEAR, BILL, CASE, MPCT) cannot
        # currently match. Kept as-is to preserve the existing mapping.
        $type = 'GEN' unless exists $EPTYPE_OF{$type};

        $epdata->{type} = $EPTYPE_OF{$type};

        # Process reviews first
        if (   defined $entry->{RI}
            && defined $entry->{C4}
            && !_type_in( $type, 'MPCT', 'GRANT' ) )
        {
            $epdata->{type} = 'review';
            $type = 'REVIEW';
        }

        #
        # Type-dependent fields. Add exceptions to the norm here; general
        # catch-all rules are declared outside this loop.
        #

        # Authors, creators, editors, etc. Mapping varies according to type
        _process_names( $epdata, $entry, $type );

        # Date type / publication status
        if ( _type_in( $type, 'UNPB', 'INPR', 'THES', 'RPRT' ) )
        {
            $epdata->{date_type} = 'submitted';
            if ( $type eq 'INPR' )
            {
                $epdata->{ispublished} = 'inpress';
            }
            elsif ( $type eq 'UNPB' )
            {
                $epdata->{ispublished} = 'unpub';
            }
            else
            {
                $epdata->{ispublished} = 'submitted';
            }
        }
        else
        {
            $epdata->{date_type}   = 'published';
            $epdata->{ispublished} = 'pub';
        }

        # CY - Location of conference (event)
        if ( _type_in( $type, 'CPAPER', 'CONF' ) )
        {
            _join_field_data( $epdata, $entry, 'CY', 'event_location', ', ' );
        }

        # IS - Number of volumes for CHAP, otherwise Issue
        if ( _type_in( $type, 'CHAP' ) )
        {
            _join_field_data( $epdata, $entry, 'IS', 'num_pieces', ', ' );
        }

        # SN - ISSN
        if ( _type_in( $type,
                'ABST', 'INPR', 'JFULL', 'JOUR', 'AGGR', 'DATA',
                'EJOUR', 'MGZN', 'MUSIC', 'NEWS' ) )
        {
            _join_field_data( $epdata, $entry, 'SN', 'issn', '; ' );
        }

        # VL - Volume / Other
        if ( _type_in( $type, 'BLOG', 'THES' ) )
        {
            _store_unmapped( $epdata, $entry, 'VL', $unmapped );
        }
        elsif ( _type_in( $type, 'CHART', 'EQUA', 'FIGURE' ) )
        {
            _join_field_data( $epdata, $entry, 'VL', 'size', ', ' );
            _store_unmapped( $epdata, $entry, 'A2', $unmapped,
                'Field not mapped to EPrints' );
        }

        # T2 - Book/Volume title / Series title / Publication / Conference
        if ( _type_in( $type, 'CPAPER', 'CONF', 'HEAR', 'UNPB' ) )
        {
            _push_array_field_data( $epdata, $entry, 'T2', 'event_title' );
        }
        elsif ( _type_in( $type,
                'CHAP', 'ECHAP', 'ENCYC', 'EQUA', 'FIGURE', 'MUSIC' ) )
        {
            _join_field_data( $epdata, $entry, 'T2', 'book_title', ', ' );
        }
        elsif ( _type_in( $type,
                'BOOK', 'CTLG', 'CLSWK', 'COMP', 'MPCT', 'MAP', 'ELEC' ) )
        {
            _join_field_data( $epdata, $entry, 'T2', 'series', ', ' );
        }

        # THES - type specific
        if ( $type eq 'THES' )
        {
            _join_field_data( $epdata, $entry, 'T2', 'department' );
            _join_field_data( $epdata, $entry, 'PB', 'institution' );
        }

        # T3 - Tertiary title / Series title / Corporation
        if ( _type_in( $type, 'BILL', 'BLOG', 'HEAR', 'UNPB' ) )
        {
            _push_array_field_data( $epdata, $entry, 'T3', 'corp_creators' );
        }
    }

    # The rest: not type dependent, or left over after picking out the
    # specific cases above

    # Title
    _join_multiple_field_data( $epdata, $entry, [ 'T1', 'TI' ], 'title' );

    # Publication title
    _join_multiple_field_data( $epdata, $entry, [ 'T2', 'JF' ], 'publication', ', ' );

    # Series title
    _join_field_data( $epdata, $entry, 'T3', 'series', ', ' );

    # Abstract
    _join_multiple_field_data( $epdata, $entry, [ 'AB', 'N2' ], 'abstract' );

    # Caption
    _join_field_data( $epdata, $entry, 'CA', 'commentary', ', ' );

    # ID - id of the record in the source repository
    _join_field_data( $epdata, $entry, 'ID', 'original_repository_id' );

    # NV - Number of volumes
    _join_field_data( $epdata, $entry, 'NV', 'num_pieces' );

    # SO - source repository of the bibliographic information
    _join_field_data( $epdata, $entry, 'SO', 'original_repository' );

    # KW - Keywords
    _join_field_data( $epdata, $entry, 'KW', 'keywords', ', ' );

    # CY - Place of publication
    _join_field_data( $epdata, $entry, 'CY', 'place_of_pub', ', ' );

    # DOI / NIHMSID / CFDA / PMCID
    _return_first_value( $epdata, $entry, [ 'DO', 'C7', 'C6' ], 'id_number', $unmapped );

    # SN - ISBN (ISSNs are caught earlier)
    _join_field_data( $epdata, $entry, 'SN', 'isbn', '; ' );

    # UR - URL
    _process_urls( $epdata, $entry );

    # PB - Publisher
    _join_field_data( $epdata, $entry, 'PB', 'publisher', ', ' );

    # M1/IS - Issue number
    _join_multiple_field_data( $epdata, $entry, [ 'IS', 'M1' ], 'number', ', ' );

    # VL - Volume numbering
    _join_field_data( $epdata, $entry, 'VL', 'volume', ', ' );

    # RI - Reviewed item
    _join_field_data( $epdata, $entry, 'RI', 'reviewed_item', ', ' );

    # SP/EP - Pages & pagerange
    my $sp = defined $entry->{SP} ? join( '', @{ $entry->{SP} } ) : undef;
    my $ep = defined $entry->{EP} ? join( '', @{ $entry->{EP} } ) : undef;
    delete $entry->{SP};
    delete $entry->{EP};
    _process_pages( $epdata, $sp, $ep );

    # Date of publication - the first usable date wins
    _process_dates( $epdata, $entry, [ 'PY', 'Y1', 'Y2', 'DA' ], $unmapped );

    # N1 - Notes
    _join_field_data( $epdata, $entry, 'N1', 'note' );

    # Process any leftovers and add $unmapped fields to the notes field
    _process_unmapped( $epdata, $entry, $unmapped );

    return $epdata;
}

# Fill in pagerange and pages from the RIS start/end page values.
#
# _process_pages
# (
#     $epdata    hash ref                (required)
#     $sp        string, start page      (undef allowed)
#     $ep        string, end page        (undef allowed)
# );
#
# Pages can be in the format:
#     SP - [start page]-[end page]
# or  SP - [start page]; EP - [end page]
# The page count is only calculated when both ends are plain numbers and the
# end is not before the start.
sub _process_pages
{
    my ( $epdata, $sp, $ep ) = @_;

    foreach my $page ( $sp, $ep )
    {
        next unless defined $page;
        $page =~ s/\A\s+//;
        $page =~ s/\s+\z//;
        $page = undef if $page eq '';
    }

    if ( defined $sp && $sp =~ /\A([0-9]*)-([0-9]*)\z/ )
    {
        my ( $start_page, $end_page ) = ( $1, $2 );
        $epdata->{pagerange} = $sp;
        if ( $start_page ne '' && $end_page ne '' && $end_page >= $start_page )
        {
            $epdata->{pages} = ( $end_page - $start_page ) + 1;
        }
    }
    elsif ( defined $sp && defined $ep )
    {
        $epdata->{pagerange} = "$sp-$ep";
        if ( $sp =~ /\A[0-9]+\z/ && $ep =~ /\A[0-9]+\z/ && $ep >= $sp )
        {
            $epdata->{pages} = ( $ep - $sp ) + 1;
        }
    }
    elsif ( defined $ep )
    {
        $epdata->{pagerange} = "-$ep";
    }
    elsif ( defined $sp )
    {
        if ( $sp =~ /\A[0-9]+\z/ )
        {
            # a lone numeric SP is stored as the page count
            $epdata->{pages} = int $sp;
        }
        else
        {
            # not a number (e.g. an article identifier): keep it verbatim
            $epdata->{pagerange} = $sp;
        }
    }

    return;
}

# Move every value of one RIS field into the list of unmapped lines.
#
# _store_unmapped
# (
#     $epdata      hash ref     (required, not used)
#     $entry       hash ref     (required)
#     $risfield    string       (required)
#     $unmapped    array ref    (required)
#     $reason      string       (defaults to 'Field not mapped to EPrints')
# );
#
# The field is removed from $entry. A heading line is added to $unmapped
# before the first value that is stored.
sub _store_unmapped
{
    my ( $epdata, $entry, $risfield, $unmapped, $reason ) = @_;

    $reason = $DEFAULT_UNMAPPED_REASON unless defined $reason;

    my $values = delete $entry->{$risfield};
    return unless defined $values;

    foreach my $field_value ( @{$values} )
    {
        if ( @{$unmapped} == 0 )
        {
            push @{$unmapped}, 'Unmapped bibliographic data:';
        }
        push @{$unmapped}, "$risfield - $field_value [$reason]";
    }

    return;
}

# Append all remaining RIS fields, plus anything already collected in
# $unmapped, to the note field.
#
# _process_unmapped
# (
#     $epdata      hash ref     (required)
#     $entry       hash ref     (required)
#     $unmapped    array ref    (required)
# );
sub _process_unmapped
{
    my ( $epdata, $entry, $unmapped ) = @_;

    # sorted so that the note text does not depend on hash ordering
    foreach my $risfield ( sort keys %{$entry} )
    {
        next unless $risfield =~ /[a-z0-9]/i;
        _store_unmapped( $epdata, $entry, $risfield, $unmapped,
            'Field not mapped to EPrints' );
    }

    return unless @{$unmapped} > 0;

    my $string = join( "\r\n", @{$unmapped} );

    # Written as if/else on purpose: "cond ? $x .= ... : $x = ..." parses
    # as "(cond ? ... : $x) = ..." and would overwrite an existing note.
    if ( defined $epdata->{note} )
    {
        $epdata->{note} .= "\r\n" . $string;
    }
    else
    {
        $epdata->{note} = $string;
    }

    return;
}

# If there are multiple URLs then the first one goes in official_url, the
# others into related_url.
#
# _process_urls
# (
#     $epdata    hash ref    (required)
#     $entry     hash ref    (required)
# );
sub _process_urls
{
    my ( $epdata, $entry ) = @_;

    my $urls = delete $entry->{UR};
    return unless defined $urls;

    foreach my $url ( @{$urls} )
    {
        if ( defined $epdata->{official_url} )
        {
            push @{ $epdata->{related_url} }, { url => $url };
        }
        else
        {
            $epdata->{official_url} = $url;
        }
    }

    return;
}

# Set the date from the first usable value found in a prioritised list of
# RIS date fields; every other value is stored as unmapped.
#
# _process_dates
# (
#     $epdata       hash ref     (required)
#     $entry        hash ref     (required)
#     $risfields    array ref    (required)
#     $unmapped     array ref    (required)
# );
#
# A date value is YYYY/MM/DD/other info, where everything after the year is
# optional. The month is only used when the year is valid, and the day only
# when the month is valid.
sub _process_dates
{
    my ( $epdata, $entry, $risfields, $unmapped ) = @_;

    foreach my $risfield ( @{$risfields} )
    {
        my $date_strings = delete $entry->{$risfield};
        next unless defined $date_strings;

        foreach my $date_string ( @{$date_strings} )
        {
            # wrapper so a single value can be handed to _store_unmapped
            my $single = { $risfield => [$date_string] };

            if ( defined $epdata->{date} )
            {
                _store_unmapped( $epdata, $single, $risfield, $unmapped,
                    'EPrints field already has value set' );
                next;
            }

            my @parts = split( m{/}, $date_string );
            foreach my $part ( @parts )
            {
                $part =~ s/\A\s+//;
                $part =~ s/\s+\z//;
            }
            my ( $year, $month, $day, $other ) = @parts;

            if ( !defined $year || $year !~ /\A[0-9]{4}\z/ )
            {
                _store_unmapped( $epdata, $single, $risfield, $unmapped,
                    'Field not mapped to EPrints' );
                next;
            }

            my $date = $year;
            if ( defined $month && $month =~ /\A[0-9]{1,2}\z/ )
            {
                $date .= sprintf( '-%02d', $month );
                if ( defined $day && $day =~ /\A[0-9]{1,2}\z/ )
                {
                    $date .= sprintf( '-%02d', $day );
                }
            }
            $epdata->{date} = $date;

            if ( defined $other && $other ne '' )
            {
                _store_unmapped( $epdata, $single, $risfield, $unmapped,
                    'EPrints field already has value set' );
            }
        }
    }

    return;
}

# Take a prioritised list of RIS fields and store the first value found.
# Fields that turn up after the EPrints field has been set are stored as
# unmapped. Only the first value of the winning field is used.
#
# _return_first_value
# (
#     $epdata       hash ref     (required)
#     $entry        hash ref     (required)
#     $risfields    array ref    (required)
#     $epfield      string       (required)
#     $unmapped     array ref    (required)
# );
sub _return_first_value
{
    my ( $epdata, $entry, $risfields, $epfield, $unmapped ) = @_;

    foreach my $risfield ( @{$risfields} )
    {
        next unless defined $entry->{$risfield};

        if ( !defined $epdata->{$epfield} )
        {
            $epdata->{$epfield} = $entry->{$risfield}[0];
            delete $entry->{$risfield};
        }
        else
        {
            _store_unmapped( $epdata, $entry, $risfield, $unmapped,
                'EPrints field already has value set' );
        }
    }

    return;
}

# Join the values of several RIS fields into one EPrints field, where each
# RIS field could have more than one value (i.e. on multiple lines).
#
# _join_multiple_field_data
# (
#     $epdata       hash ref     (required)
#     $entry        hash ref     (required)
#     $risfields    array ref    (required)
#     $epfield      string       (required)
#     $separator    string       (defaults to the empty string)
# );
sub _join_multiple_field_data
{
    my ( $epdata, $entry, $risfields, $epfield, $separator ) = @_;

    $separator = '' unless defined $separator;

    my @values = ();
    foreach my $risfield ( @{$risfields} )
    {
        next unless defined $entry->{$risfield};
        push @values, join( $separator, @{ $entry->{$risfield} } );
        delete $entry->{$risfield};
    }

    $epdata->{$epfield} = join( $separator, @values ) if @values > 0;

    return;
}

# Join the values of one RIS field into one EPrints field, where the RIS
# field could have more than one value (i.e. on multiple lines).
#
# _join_field_data
# (
#     $epdata       hash ref    (required)
#     $entry        hash ref    (required)
#     $risfield     string      (required)
#     $epfield      string      (required)
#     $separator    string      (defaults to the empty string)
# );
sub _join_field_data
{
    my ( $epdata, $entry, $risfield, $epfield, $separator ) = @_;

    return unless defined $entry->{$risfield};

    $separator = '' unless defined $separator;

    $epdata->{$epfield} = join( $separator, @{ $entry->{$risfield} } );
    delete $entry->{$risfield};

    return;
}

# Append every value of one RIS field to an EPrints field that holds a list.
#
# _push_array_field_data
# (
#     $epdata      hash ref    (required)
#     $entry       hash ref    (required)
#     $risfield    string      (required)
#     $epfield     string      (required)
# );
sub _push_array_field_data
{
    my ( $epdata, $entry, $risfield, $epfield ) = @_;

    my $values = delete $entry->{$risfield};
    return unless defined $values && @{$values};

    push @{ $epdata->{$epfield} }, @{$values};

    return;
}

# Names get converted differently depending on the publication type.
#
# _process_names
# (
#     $epdata    hash ref    (required)
#     $entry     hash ref    (required)
#     $type      string      (required)
# );
sub _process_names
{
    my ( $epdata, $entry, $type ) = @_;

    # Primary authors - catch reviewers first
    if ( _type_in( $type, 'REVIEW' ) )
    {
        _names( $epdata, $entry, ['C5'], 'creators' );
        _names( $epdata, $entry, [ 'AU', 'A1' ], 'ri_creator' );
    }
    else
    {
        _names( $epdata, $entry, [ 'AU', 'A1' ], 'creators' );
    }

    # secondary/tertiary authors
    if ( _type_in( $type, 'BILL', 'CONF' ) )
    {
        _names( $epdata, $entry, ['A2'], 'contributors', 'sponsor' );
    }
    elsif ( _type_in( $type, 'ADVS', 'SLIDE', 'SOUND', 'VIDEO' ) )
    {
        _names( $epdata, $entry, ['A2'], 'contributors', 'performer' );
    }
    elsif ( _type_in( $type, 'BLOG' ) )
    {
        _names( $epdata, $entry, ['A3'], 'contributors', 'illustrator' );
    }
    elsif ( _type_in( $type, 'CASE' ) )
    {
        _names( $epdata, $entry, ['A2'], 'contributors', 'reporter' );
        _names( $epdata, $entry, [ 'A3', 'A4' ], 'contributors', 'other' );
    }
    elsif ( _type_in( $type, 'THES' ) )
    {
        _names( $epdata, $entry, ['A3'], 'contributors', 'other' );
    }
    elsif ( _type_in( $type, 'DATA', 'MUSIC' ) )
    {
        _names( $epdata, $entry, ['A2'], 'contributors', 'producer' );
    }
    elsif ( _type_in( $type, 'MPCT' ) )
    {
        _names( $epdata, $entry, ['A2'], 'contributors', 'director' );
        _names( $epdata, $entry, ['A3'], 'contributors', 'producer' );
        _names( $epdata, $entry, ['A4'], 'contributors', 'performer' );
    }
    elsif ( _type_in( $type, 'PCOMM', 'ICOMM' ) )
    {
        _names( $epdata, $entry, ['A2'], 'contributors', 'recipient' );
    }
    else
    {
        _names( $epdata, $entry, [ 'A2', 'A3' ], 'editors' );
        _names( $epdata, $entry, ['A4'], 'contributors', 'translator' );
    }

    return;
}

# Convert the values of the given RIS name fields into EPrints names.
#
# _names
# (
#     $epdata              hash ref     (required)
#     $entry               hash ref     (required)
#     $risfields           array ref    (required)
#     $epfield             string       (required)
#     $contributor_type    string       (undef allowed)
# );
#
# A value is expected as 'family, given, lineage'. A value without a comma
# is pushed onto corp_creators instead. When $epfield is 'contributors' and
# $contributor_type is a known role, each name also gets a type.
sub _names
{
    my ( $epdata, $entry, $risfields, $epfield, $contributor_type ) = @_;

    my $relator;
    if ( defined $contributor_type && $epfield eq 'contributors' )
    {
        $relator = $RELATOR_OF{ lc $contributor_type };
    }

    my @names = ();
    foreach my $risfield ( @{$risfields} )
    {
        my $values = delete $entry->{$risfield};
        next unless defined $values;

        foreach my $risstring ( @{$values} )
        {
            if ( $risstring !~ m/,/ )
            {
                # Corporate bodies can be authors. This is a crude test: if
                # there is no comma then assume it is a corporation (not
                # accurate, as it could match creators with a single name,
                # or miss corporations with a comma in their name).
                push @{ $epdata->{corp_creators} }, $risstring;
                next;
            }

            my @parts = split( /,/, $risstring );
            next unless @parts;

            foreach my $part ( @parts )
            {
                $part =~ s/\A\s+//;
                $part =~ s/\s+\z//;
            }
            my ( $family, $given, $lineage ) = @parts;

            my $name = {
                name => {
                    family  => $family,
                    given   => $given,
                    lineage => $lineage,
                },
            };
            $name->{type} = $relator if defined $relator;

            push @names, $name;
        }
    }

    if ( @names > 0 )
    {
        push @{ $epdata->{$epfield} }, @names;
    }

    return;
}

1;

=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2000-2011 University of Southampton.

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints L<http://www.eprints.org/>.

EPrints is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

EPrints is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints.  If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END