=head1 NAME

EPrints::Plugin::Import::DOI - import eprints from a list of DOIs via CrossRef

=head1 DESCRIPTION

Reads DOIs (one per line) from a file handle, asks the CrossRef OpenURL
resolver for the metadata of each one in C<unixsd> format, flattens the XML
reply into a hash and maps that hash onto eprint fields.

Plugin parameters read by this file:

=over 4

=item pid

CrossRef account identifier sent with every query (required).

=item use_prefix

If true the DOI is normalised to carry a leading C<doi:>, otherwise the
prefix is stripped (default: 0).

=item doi_field

Name of the eprint field that holds the DOI (default: C<id_number>).

=back

=cut

package EPrints::Plugin::Import::DOI;

use strict;

use EPrints::Plugin::Import::TextFile;
require URI;

our @ISA = qw/ EPrints::Plugin::Import::TextFile /;

# Elements whose whole subtree is ignored by ExploitNode.
my %SKIP_ELEMENT = map { $_ => 1 } qw(
	abstract archive_locations citation_list component_list crossmark
	doi_data identifier item_number noisbn program publisher_item
	sa_component
);

# Pure container elements: ExploitNode recurses into them instead of
# recording them.
my %DESCEND_ELEMENT = map { $_ => 1 } qw(
	book book_metadata book_series_metadata book_set_metadata conference
	conference_paper content_item crm-item crossref crossref_metadata
	database database_date database_metadata dataset dissertation doi_record
	event_metadata institution journal journal_article journal_issue
	journal_metadata journal_volume pages proceedings_metadata
	proceedings_series_metadata publisher query report-paper
	report-paper_metadata report-paper_series_metadata series_metadata
	standard standard_metadata standards_body standard_series_metadata
	titles
);

# Last run of digits in a page label (e.g. "e123" -> 123).
my $PAGE_PATTERN = qr/.*?(\d+)\D*$/;
my $ISSN_PATTERN = qr/^(\d{4})(\d{3}[\dX])$/;
# cf. https://www.isbn-international.org/sites/default/files/ISBN%20Manual%202012%20-corr.pdf
my $ISBN10_PATTERN = qr/^(\d)(\d{5})(\d{3})([\dX])$/;
my $ISBN13_PATTERN = qr/^(\d{3})(\d)(\d{4})(\d{4})([\dX])$/;

=head1 new

Constructor: sets the plugin's display name, visibility, produced types and
the name of its screen plugin.

=cut

sub new
{
	my( $class, %params ) = @_;

	my $self = $class->SUPER::new( %params );

	$self->{name} = "DOI (via CrossRef)";
	$self->{visible} = "all";
	$self->{produce} = [ 'dataobj/eprint', 'list/eprint' ];
	$self->{screen} = "Import::DOI";

	return $self;
}

=head1 screen

Returns the C<Screen::Import::DOI> plugin, created with the given parameters.

=cut

sub screen
{
	my( $self, %params ) = @_;

	return $self->{repository}->plugin( "Screen::Import::DOI", %params );
}

=head1 input_text_fh

Imports every DOI found on C<$opts{fh}> (one per line) into
C<$opts{dataset}>.

A DOI is skipped, with a message sent to the handler, when it already exists
in the C<eprint> dataset, when CrossRef sends no usable reply, when the reply
status is not C<resolved>, or when the flattened reply contains an C<error>
entry.

Returns an C<EPrints::List> of the ids of the created objects, or undef
(after an C<error> message) when the C<pid> parameter is not configured.

=cut

# cf. http://help.crossref.org/query-results
# cf. http://help.crossref.org/unixsd for metadata format
# cf. http://www.crossref.org/help/schema_doc/crossref_query_output3.0/query_output3.0.html
# cf. http://www.crossref.org/schemas/crossref_query_output3.0.xsd
# cf. http://www.crossref.org/help/schema_doc/unixref1.1/unixref1.1.html
# cf. http://doi.crossref.org/schemas/unixref1.1.xsd
# cf. https://jats.nlm.nih.gov/publishing/1.1/xsd.html
sub input_text_fh
{
	my( $plugin, %opts ) = @_;

	my @ids;

	my $pid = $plugin->param( "pid" );
	my $session = $plugin->{repository};
	my $use_prefix = $plugin->param( "use_prefix" ) || 0;
	my $doi_field = $plugin->param( "doi_field" ) || 'id_number';

	unless( $pid )
	{
		$plugin->handler->message( "error", $plugin->html_phrase( "missing_pid" ));
		return undef;
	}

	# Shorthand for wrapping a plain string as a text node for phrases.
	my $text = sub { return $plugin->{session}->make_text( $_[0] ); };

	my $fh = $opts{fh};
	DOI: while( my $doi = <$fh> ) # process all available DOIs (one per line)
	{
		# Remove surrounding blanks (including the line terminator) even
		# when the line contains inner whitespace.
		$doi =~ s/^\s+//;
		$doi =~ s/\s+$//;
		next DOI unless length($doi);

		# START check and exclude DOI from fetch if DOI already exists
		# somewhere in the 'eprint' dataset
		# - Alan Stiles, Open University, 20140408
		# default: $session->dataset( 'archive' )->search( ... )  # modify?
		my $duplicates = $session->dataset( 'eprint' )->search(
			filters => [
				{ meta_fields => [$doi_field], value => $doi, match => "EX" }
			]
		);
		if( $duplicates->count() > 0 )
		{
			$plugin->handler->message( "warning", $plugin->html_phrase( "duplicate_doi",
				doi => $text->( $doi ),
				msg => $duplicates->item( 0 )->render_citation_link(),
			));
			next DOI;
		}
		# END check and exclude DOI from fetch if DOI already exists
		# - Alan Stiles, Open University, 20140408

		# prepare request for metadata
		# https://doi.crossref.org/servlet/query?pid=NAME:PASSWORD&id=10.1577/H02-043
		# where:
		#   pid    is your Crossref-supplied login name and password or
		#          Crossref Query Services email address.
		#   format is the desired results format ( xsd_xml | unixref | unixsd )
		#   id     is a Crossref DOI
		# https://doi.crossref.org/search/doi?pid=PID&format=unixsd&doi=DOI
		my %params = (
			pid => $pid,
			noredirect => "true",
			format => "unixsd",
			id => $doi,
		);

		my $url = URI->new( "http://doi.crossref.org/openurl" );
		$url->query_form( %params );

		# Capture $@ right away so nothing later can clobber it.
		my $dom_doc = eval { EPrints::XML::parse_url( $url ) };
		my $parse_error = $@;

		# evaluate reply
		my $dom_top;
		$dom_top = $dom_doc->getDocumentElement if defined $dom_doc;
		my $dom_result = _first_element( $dom_top, "query_result" );
		my $dom_body = _first_element( $dom_result, "body" );
		my $dom_query = _first_element( $dom_body, "query" );

		# check for no (or structurally incomplete) reply
		if( $parse_error || !defined $dom_query )
		{
			$plugin->handler->message( "warning", $plugin->html_phrase( "no_reply",
				doi => $text->( $doi )
			));
			EPrints::XML::dispose( $dom_doc ) if defined $dom_doc;
			next DOI;
		}

		# check for bad reply
		my $status = $dom_query->getAttribute( "status" );
		if( defined($status) && $status ne "resolved" )
		{
			my $dom_msg = _first_element( $dom_query, "msg" );
			my $msg = "";
			$msg = EPrints::Utils::tree_to_utf8( $dom_msg ) if defined $dom_msg;

			# A status starting with "system" is reported as unresolved,
			# any other one as an invalid DOI.
			my $phrase_id = $status =~ /^system/ ? "unresolved_doi" : "invalid_doi";
			$plugin->handler->message( "warning", $plugin->html_phrase( $phrase_id,
				doi => $text->( $doi ),
				msg => $text->( $msg )
			));
			EPrints::XML::dispose( $dom_doc );
			next DOI;
		}

		# Only include prefix if config parameter set
		# - Alan Stiles, Open University, 20140408
		if( $use_prefix )
		{
			$doi =~ s/^(doi:)?/doi:/i;
		}
		else
		{
			$doi =~ s/^(doi:)?//i;
		}

		# exploit reply, i.e. fill $data
		my $data = { doi => $doi };
		ExploitNode( $dom_body, $data );

		EPrints::XML::dispose( $dom_doc );

		if( defined $data->{"error"} )
		{
			$plugin->handler->message( "error", $plugin->html_phrase( "error",
				doi => $text->( $doi ),
				msg => $text->( $data->{"error"} )
			));
			next DOI;
		}

		my $epdata = $plugin->convert_input( $doi_field, $data );
		next DOI unless defined $epdata;

		my $dataobj = $plugin->epdata_to_dataobj( $opts{dataset}, $epdata );
		push @ids, $dataobj->get_id if defined $dataobj;
	}

	return EPrints::List->new(
		dataset => $opts{dataset},
		session => $plugin->{session},
		ids=>\@ids );
}

# Returns the first descendant element of $parent with tag name $tag, or
# undef when $parent is undef or no such element exists.
sub _first_element
{
	my( $parent, $tag ) = @_;

	return undef unless defined $parent;

	my $first = ($parent->getElementsByTagName( $tag ))[0];
	return $first;
}

=head1 ExploitNode

Walks the element children of C<$root> and fills the hash ref C<$data>.

Elements on the skip list are ignored, container elements are walked
recursively, C<contributors>, C<person_name> and C<*_date> elements are
handled by helper functions, and every other element is stored as text
under its tag name, extended by C<.type>, C<.media_type> and C<.name>
attribute values where present (a C<name> value starting with C<xref> is
not appended). For plain text entries the first occurrence wins.

=cut

sub ExploitNode
{
	my( $root, $data ) = @_;

	# TODO (from the original author): decide where to map several further
	# optional elements (cardinalities {0,1} and {0,unbounded}); their
	# names were lost from the original comment.

	NODE: foreach my $node ( $root->getChildNodes )
	{
		next NODE unless EPrints::XML::is_dom( $node, "Element" );

		my $name = $node->tagName;
		next NODE if $SKIP_ELEMENT{$name};

		if( $DESCEND_ELEMENT{$name} )
		{
			ExploitNode( $node, $data );
			next NODE;
		}

		# Qualify the key with the distinguishing attribute values.
		foreach my $attr ( "type", "media_type" )
		{
			next unless $node->hasAttribute( $attr );
			my $value = $node->getAttribute( $attr );
			$name .= "." . $value if defined $value;
		}
		if( $node->hasAttribute( "name" ) )
		{
			my $value = $node->getAttribute( "name" );
			# "!~" is required here: "! $value =~ /.../" would negate
			# $value first and never match.
			$name .= "." . $value if defined $value && $value !~ /^xref/;
		}

		if( $name eq "contributors" )
		{
			SetContributors( $data, $node );
		}
		elsif( $name eq "person_name" )
		{
			# Keep "creators" a list of creator hashes, the same shape
			# SetContributors produces.
			push @{ $data->{"creators"} }, SetCreator( $node );
		}
		elsif( $name ne "conference_date" && $name =~ /_date/ )
		{
			$data->{$name} = SetDate( $node );
		}
		else
		{
			$data->{$name} = EPrints::Utils::tree_to_utf8( $node )
				unless defined($data->{$name});
		}
	}
}

=head1 SetDate

Recognizes child elements named C<(start_|end_)?(day|month|year)>; a
C<start_> prefix is dropped.

Returns a string of the form C<(dd-)?(mm-)?yyyy>; when both a start and an
end date are present the result is C<"start - end">. Returns the empty
string when no year is present.

=cut

# NOTE(review): convert_input stores this day-first string in the eprint
# "date" field - confirm that field accepts this order.
sub SetDate
{
	my( $root ) = @_;

	my %part;
	foreach my $node ( $root->childNodes )
	{
		next unless EPrints::XML::is_dom( $node, "Element" );

		my $name = $node->nodeName;
		$name =~ s/^start_//;
		$part{$name} = EPrints::Utils::tree_to_utf8( $node );
	}

	my $start = _join_date( $part{"day"}, $part{"month"}, $part{"year"} );
	my $end = _join_date( $part{"end_day"}, $part{"end_month"}, $part{"end_year"} );

	return join( " - ", grep { length } $start, $end );
}

# Builds "(dd-)?(mm-)?yyyy" from optional parts. Without a year the result
# is empty; the day is only used when a month is present.
sub _join_date
{
	my( $day, $month, $year ) = @_;

	return "" unless defined $year && length $year;
	return $year unless defined $month;

	my $date = $month . "-" . $year;
	$date = $day . "-" . $date if defined $day;

	return $date;
}

=head1 GetDate

Returns the best value for the given element name: the unqualified entry
if defined, otherwise the C<.print>, C<.online> or C<.other> variant (in
that order), otherwise undef.

=cut

sub GetDate
{
	my( $data, $name ) = @_;

	foreach my $type ( "", ".print", ".online", ".other" )
	{
		return $data->{$name.$type} if defined $data->{$name.$type};
	}

	return undef;
}

=head1 SetCreator

Recognizes the child elements C<given_name>, C<surname> and C<ORCID> of a
contributor.

Returns a hash ref C<< { name => { given, family }, id } >> containing only
the parts that were found.

=cut

sub SetCreator
{
	my( $contributor ) = @_;

	my $creator = { name => {} };

	foreach my $part ( $contributor->childNodes )
	{
		next unless EPrints::XML::is_dom( $part, "Element" );

		my $tag = $part->nodeName;
		if( $tag eq "given_name" )
		{
			$creator->{name}->{given} = EPrints::Utils::tree_to_utf8( $part );
		}
		elsif( $tag eq "surname" )
		{
			$creator->{name}->{family} = EPrints::Utils::tree_to_utf8( $part );
		}
		elsif( $tag eq "ORCID" )
		{
			$creator->{id} = EPrints::Utils::tree_to_utf8( $part );
		}
	}

	return $creator;
}

=head1 SetContributors

Sorts the children of a C<contributors> element by kind and role:
C<organization> elements become C<corp_creators>, persons with the role
C<editor> become C<editors>, all other persons become C<creators>. Persons
without a surname are dropped.

Stores each non-empty list in C<$data>.

=cut

sub SetContributors
{
	my( $data, $node ) = @_;

	my @corp_creators;
	my @creators;
	my @editors;

	# children are contributor, organization or person_name elements
	foreach my $contributor ( $node->childNodes )
	{
		next unless EPrints::XML::is_dom( $contributor, "Element" );

		if( $contributor->tagName eq "organization" )
		{
			push @corp_creators, EPrints::Utils::tree_to_utf8( $contributor );
			next;
		}

		# A missing contributor_role counts as "author".
		my $role = "author";
		if( $contributor->hasAttribute( "contributor_role" ) )
		{
			$role = $contributor->getAttribute( "contributor_role" );
		}

		my $creator = SetCreator( $contributor );
		next unless exists $creator->{name}->{family};

		if( $role eq "editor" )
		{
			push @editors, $creator;
		}
		else
		{
			push @creators, $creator;
		}
	}

	$data->{"corp_creators"} = \@corp_creators if @corp_creators;
	$data->{"creators"} = \@creators if @creators;
	$data->{"editors"} = \@editors if @editors;
}

# Hyphenates a bare 13- or 10-character ISBN using fixed group widths;
# any other input is returned unchanged.
sub _format_isbn
{
	my( $isbn ) = @_;

	$isbn =~ s/$ISBN13_PATTERN/$1-$2-$3-$4-$5/
		or $isbn =~ s/$ISBN10_PATTERN/$1-$2-$3-$4/;

	return $isbn;
}

# Inserts the hyphen into a bare 8-character ISSN; any other input is
# returned unchanged.
sub _format_issn
{
	my( $issn ) = @_;

	$issn =~ s/$ISSN_PATTERN/$1-$2/;

	return $issn;
}

=head1 convert_input

Maps the flat hash produced by C<ExploitNode> onto eprint fields.

C<$doi_field> is the name of the eprint field that receives the DOI (with
any leading C<doi:> removed). Returns the epdata hash ref.

=cut

sub convert_input
{
	my( $plugin, $doi_field, $data ) = @_;

	my $epdata = {};

	# --- people -------------------------------------------------------
	if( defined $data->{"creators"} )
	{
		$epdata->{creators} = $data->{"creators"};
	}
	elsif( defined $data->{"author"} )
	{
		$epdata->{creators} = [
			{
				name=>{ family=>$data->{"author"} },
			}
		];
	}

	if( defined $data->{"editors"} )
	{
		$epdata->{editors} = $data->{"editors"};
	}

	if( defined $data->{"corp_creators"} )
	{
		$epdata->{corp_creators} = $data->{"corp_creators"};
	}
	if( defined $data->{"standards_body_name"} )
	{
		# corp_creators is handled as a list everywhere else in this
		# file, so append instead of replacing it by a scalar.
		$epdata->{corp_creators} = [
			@{ $epdata->{corp_creators} || [] },
			$data->{"standards_body_name"},
		];
	}

	# --- identifiers --------------------------------------------------
	if( defined $data->{"isbn.electronic"} )
	{
		$epdata->{type} = "book";
		# modify? (pick the field used locally for electronic ISBNs)
		# $epdata->{isbn} = _format_isbn( $data->{"isbn.electronic"} );
		# $epdata->{eisbn} = _format_isbn( $data->{"isbn.electronic"} );
	}
	if( defined $data->{"isbn.print"} )
	{
		$epdata->{type} = "book";
		$epdata->{isbn} = _format_isbn( $data->{"isbn.print"} );
	}

	if( defined $data->{"issn.electronic"} )
	{
		# modify? (pick the field used locally for electronic ISSNs)
		# $epdata->{issn} = _format_issn( $data->{"issn.electronic"} );
		# $epdata->{eissn} = _format_issn( $data->{"issn.electronic"} );
	}
	if( defined $data->{"issn.print"} )
	{
		$epdata->{issn} = _format_issn( $data->{"issn.print"} );
	}

	if( defined $data->{"doi"} )
	{
		my $doi = $data->{"doi"};
		$doi =~ s/^\s*doi:\s*//gi;
		$epdata->{$doi_field} = $doi;
	}
	if( defined $data->{"resource_t"} )
	{
		$epdata->{official_url} = $data->{"resource_t"};
	}

	# --- titles; each one also implies an item type (later ones win) ---
	if( defined $data->{"full_title"} || defined $data->{"journal_title"} || defined $data->{"journal-title"} )
	{
		$epdata->{type} = "article";
		# precedence, lowest to highest: full_title, journal-title, journal_title
		$epdata->{publication} = $data->{"full_title"} if defined($data->{"full_title"});
		$epdata->{publication} = $data->{"journal-title"} if defined($data->{"journal-title"});
		$epdata->{publication} = $data->{"journal_title"} if defined($data->{"journal_title"});
	}
	if( defined $data->{"article_title"} )
	{
		$epdata->{type} = "article";
		$epdata->{title} = $data->{"article_title"};
	}
	if( defined $data->{"series_title"} )
	{
		$epdata->{type} = "book";
		$epdata->{note} = 'series title: ' . $data->{"series_title"};
	}
	if( defined $data->{"volume_title"} )
	{
		$epdata->{type} = "book";
		$epdata->{book_title} = $data->{"volume_title"};
	}
	if( defined $data->{"proceedings_title"} )
	{
		$epdata->{type} = "book_section";
		$epdata->{book_title} = $data->{"proceedings_title"};
	}

	# May be undef; overwritten below by year/month or publication_date.
	$epdata->{date} = GetDate($data, "approval_date");

	$epdata->{thesis_type} = $data->{"degree"}
		if defined $data->{"degree"}
		&& $data->{"degree"} =~ /^(diploma|masters|doctoral|postdoctoral|others)$/;

	# --- publisher and place --------------------------------------------
	if( defined $data->{"publisher"} )
	{
		$epdata->{publisher} = $data->{"publisher"};
	}
	elsif( defined $data->{"publisher_name"} )
	{
		$epdata->{publisher} = $data->{"publisher_name"};
	}
	elsif( defined $data->{"crm-item.publisher-name"} )
	{
		$epdata->{publisher} = $data->{"crm-item.publisher-name"};
	}

	$epdata->{place_of_pub} = $data->{"publisher_place"} if defined $data->{"publisher_place"};
	# $epdata->{place_of_pub} = $data->{"archive_locations"} if defined $data->{"archive_locations"};
	$epdata->{place_of_pub} = $data->{"institution_place"} if defined $data->{"institution_place"};

	$epdata->{abstract} = $data->{"description"} if defined $data->{"description"};
	$epdata->{keywords} = $data->{"proceedings_subject"} if defined $data->{"proceedings_subject"};

	# --- volume, issue, pages -------------------------------------------
	$epdata->{volume} = $data->{"volume"} if defined $data->{"volume"};
	$epdata->{number} = $data->{"issue"} if defined $data->{"issue"};

	$epdata->{pagerange} = $data->{"first_page"}."-" if defined $data->{"first_page"};
	if( defined $data->{"last_page"} )
	{
		$epdata->{pagerange} = "-" unless defined $epdata->{pagerange};
		$epdata->{pagerange} .= $data->{"last_page"};
	}
	if( defined $data->{"first_page"} && defined $data->{"last_page"} )
	{
		# Page labels may carry non-digits; use the last run of digits.
		my( $first ) = $data->{"first_page"} =~ $PAGE_PATTERN;
		my( $last ) = $data->{"last_page"} =~ $PAGE_PATTERN;
		if( defined $first && defined $last && $last >= $first )
		{
			$epdata->{pages} = $last - $first + 1;
		}
	}

	# --- institution and event ------------------------------------------
	$epdata->{department} = $data->{"institution_department"} if defined $data->{"institution_department"};
	$epdata->{institution} = $data->{"institution_name"} if defined $data->{"institution_name"};

	$epdata->{event_title} = $data->{"conference_name"} if defined $data->{"conference_name"};
	$epdata->{event_location} = $data->{"conference_location"} if defined $data->{"conference_location"};
	$epdata->{event_dates} = $data->{"conference_date"} if defined $data->{"conference_date"};

	# --- dates ------------------------------------------------------------
	if( defined $data->{"year"} && $data->{"year"} =~ /^[0-9]{4}$/ )
	{
		$epdata->{date} = $data->{"year"}; # does 'media_type' matter?
		if( defined $data->{"month"} && $data->{"month"} =~ /^[0-9]{2}$/ )
		{
			$epdata->{date} .= '-' . $data->{"month"};
		}
	}

	my $date = GetDate($data, "publication_date");
	if( defined $date )
	{
		$epdata->{date_type} = "published";
		$epdata->{date} = $date; # deliberately overwriting approval_date
	}

	$date = GetDate($data, "creation_date");
	if( defined $date )
	{
		$epdata->{note} .= ", " if defined $epdata->{note};
		$epdata->{note} .= "creation date: ".$date;
	}

	# --- fall back to the DOI type when no title implied an item type ----
	if( !defined $epdata->{type} )
	{
		$epdata->{type} = "article"
			if defined $data->{"doi.journal_article"} || defined $data->{"doi.journal_issue"}
			|| defined $data->{"doi.journal_title"} || defined $data->{"doi.journal_volume"};
		$epdata->{type} = "book"
			if defined $data->{"doi.book_content"} || defined $data->{"doi.book_series"}
			|| defined $data->{"doi.book_title"};
		$epdata->{type} = "book"
			if defined $data->{"component_number"} || defined $data->{"edition_number"};
		$epdata->{type} = "conference_item"
			if defined $data->{"doi.conference_paper"} || defined $data->{"doi.conference_series"}
			|| defined $data->{"doi.conference_title"};
		$epdata->{type} = "dataset" if defined $data->{"doi.dataset"};
		$epdata->{type} = "monograph"
			if defined $data->{"doi.report-paper_content"} || defined $data->{"doi.report-paper_series"}
			|| defined $data->{"doi.report-paper_title"};
		$epdata->{type} = "thesis" if defined $data->{"doi.dissertation"};
	}

	# A generic title/subtitle goes to book_title for books, else to title.
	my $title_field = "title";
	$title_field = "book_title" if defined $epdata->{type} && $epdata->{type} eq "book";
	$epdata->{$title_field} = $data->{"title"} if defined($data->{"title"});
	$epdata->{$title_field} .= ": " . $data->{"subtitle"} if defined($data->{"subtitle"});

	return $epdata;
}

=head1 url_encode

Returns the given string with every character outside C<[A-Za-z0-9]>
replaced by C<%> followed by its code point in upper-case hexadecimal (at
least two digits). Not called anywhere in this file.

=cut

sub url_encode
{
	my ($str) = @_;
	$str =~ s/([^A-Za-z0-9])/sprintf("%%%02X", ord($1))/seg;
	return $str;
}

1;

=head1 COPYRIGHT

=for COPYRIGHT BEGIN

Copyright 2000-2011 University of Southampton.

=for COPYRIGHT END

=for LICENSE BEGIN

This file is part of EPrints L<http://www.eprints.org/>.

EPrints is free software: you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

EPrints is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public
License along with EPrints.  If not, see L<http://www.gnu.org/licenses/>.

=for LICENSE END