diff --git a/app/views/layouts/application.html.erb b/app/views/layouts/application.html.erb index cba95b0c3..7367b9154 100644 --- a/app/views/layouts/application.html.erb +++ b/app/views/layouts/application.html.erb @@ -3,7 +3,7 @@ <%= render 'layouts/head' %> - <%= 'header-notice-present' if TeSS::Config.header_notice&.strip.present? %>> + <%= render partial: 'layouts/header' %>
# --- lib/ingestors/ingestor_factory.rb (diff hunk; enclosing method only partially visible) ---
# @@ -11,6 +11,7 @@ def self.ingestors
#          Ingestors::MaterialCsvIngestor,
#          Ingestors::TessEventIngestor,
#          Ingestors::ZenodoIngestor,
#   +      Ingestors::OaiPmhIngestor,
#          Ingestors::GithubIngestor,
#        ] + taxila_ingestors + llm_ingestors
#      end

# --- lib/ingestors/oai_pmh_ingestor.rb (new file) ---
require 'open-uri'
require 'tess_rdf_extractors'

module Ingestors
  # Ingests events and materials from an OAI-PMH endpoint.
  #
  # `read` first lists records with the `rdf` metadata prefix and extracts
  # Bioschemas resources from their RDF/XML payload. If that yields no
  # resources (or the endpoint answers with an OAI-PMH error), it lists the
  # records again without a prefix and parses them as Dublin Core.
  class OaiPmhIngestor < Ingestor
    # XPath namespace bindings for Dublin Core payloads.
    DUBLIN_CORE_NAMESPACES = {
      'dc' => 'http://purl.org/dc/elements/1.1/',
      'oai_dc' => 'http://www.openarchives.org/OAI/2.0/oai_dc/'
    }.freeze

    # XPath namespace binding used to locate the rdf:RDF payload.
    RDF_NAMESPACES = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' }.freeze

    # Prefixes that mark a Dublin Core value as a URL.
    URL_PREFIXES = %w[http:// https://].freeze

    # Resolver prefix removed from a DOI before it is normalised.
    DOI_RESOLVER_PATTERN = %r{\Ahttps?://doi\.org/}.freeze

    # @return [Hash] static ingestor settings (key, title, user agent, contact mail)
    def self.config
      {
        key: 'oai_pmh',
        title: 'OAI-PMH',
        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
        mail: Rails.configuration.tess['contact_email']
      }
    end

    def initialize
      super

      # to use some helper functions that are instance level methods of BioschemasIngestor
      @bioschemas_manager = BioschemasIngestor.new
    end

    # Reads the endpoint at +source_url+, preferring Bioschemas RDF over Dublin Core.
    #
    # @param source_url [String] base URL of the OAI-PMH endpoint
    def read(source_url)
      client = OAI::Client.new source_url, headers: { 'From' => config[:mail] }
      found_bioschemas = begin
        read_oai_rdf(client)
      rescue OAI::Exception => e
        # Any OAI-PMH protocol error on the RDF listing (not only badArgument)
        # means Bioschemas is unavailable here, so fall back to Dublin Core
        # instead of aborting the whole ingest.
        Rails.logger.info("OAI-PMH RDF listing unavailable (#{e.class}): #{e.message}")
        false
      end

      read_oai_dublin_core(client) unless found_bioschemas
    end

    # @return [Hash{String => String}] namespace bindings for Dublin Core XPath queries
    def ns
      DUBLIN_CORE_NAMESPACES
    end

    # Lists all records without a metadata prefix and ingests each one as a
    # Dublin Core event or material, then appends a record count to @messages.
    #
    # @param client [#list_records] OAI-PMH client
    def read_oai_dublin_core(client)
      count = 0
      client.list_records.full.each do |record|
        xml_string = record.metadata.to_s
        # Records without a metadata payload (e.g. OAI-PMH deleted records)
        # would otherwise be ingested as blank materials.
        next if xml_string.strip.empty?

        doc = Nokogiri::XML(xml_string)

        # this event detection heuristic captures in particular
        # - http://purl.org/dc/dcmitype/Event (the standard way of typing an event in dublin core)
        # - https://schema.org/Event
        if dublin_core_texts(doc, 'type').any? { |type| type.downcase.include?('event') }
          read_dublin_core_event(doc)
        else
          read_dublin_core_material(doc)
        end

        count += 1
      end
      @messages << "found #{count} records"
    end

    # Builds a material from a Dublin Core document and passes it to add_material.
    #
    # @param xml_doc [Nokogiri::XML::Document] parsed record metadata
    def read_dublin_core_material(xml_doc)
      material = OpenStruct.new
      material.title = first_dublin_core_text(xml_doc, 'title')
      material.description = convert_description(first_dublin_core_text(xml_doc, 'description'))
      material.authors = dublin_core_texts(xml_doc, 'creator')
      material.contributors = dublin_core_texts(xml_doc, 'contributor')

      # Prefer a licence URL over free text; fall back to 'notspecified'.
      rights = dublin_core_texts(xml_doc, 'rights').map(&:strip).reject(&:empty?)
      material.licence = rights.find { |r| r.start_with?(*URL_PREFIXES) } || rights.first || 'notspecified'

      dates = dublin_core_dates(xml_doc)
      material.date_created = dates.first
      material.date_modified = dates.last if dates.size > 1

      # Stripped so that padded values still match the prefix checks below.
      identifiers = dublin_core_texts(xml_doc, 'identifier').map(&:strip)
      doi = identifiers.find { |id| id.start_with?('10.') || id.match?(DOI_RESOLVER_PATTERN) }
      material.doi = "https://doi.org/#{doi.sub(DOI_RESOLVER_PATTERN, '')}" if doi
      material.url = identifiers.find { |id| id.start_with?(*URL_PREFIXES) }

      material.keywords = dublin_core_texts(xml_doc, 'subject')
      material.resource_type = dublin_core_texts(xml_doc, 'type')
      material.contact = first_dublin_core_text(xml_doc, 'publisher')

      add_material material
    end

    # Builds an event from a Dublin Core document and passes it to add_event.
    # The first parseable dc:date becomes the start, the last one the end.
    #
    # @param xml_doc [Nokogiri::XML::Document] parsed record metadata
    def read_dublin_core_event(xml_doc)
      event = OpenStruct.new

      event.title = first_dublin_core_text(xml_doc, 'title')
      event.description = convert_description(first_dublin_core_text(xml_doc, 'description'))
      event.url = dublin_core_texts(xml_doc, 'identifier').map(&:strip).find { |id| id.start_with?(*URL_PREFIXES) }
      event.contact = first_dublin_core_text(xml_doc, 'publisher')
      event.organizer = first_dublin_core_text(xml_doc, 'creator')
      event.keywords = dublin_core_texts(xml_doc, 'subject')
      event.event_types = dublin_core_texts(xml_doc, 'type')

      dates = dublin_core_dates(xml_doc)
      event.start = dates.first
      event.end = dates.last

      add_event event
    end

    # Lists records with the `rdf` metadata prefix, extracts Bioschemas
    # resources from each, de-duplicates them across records and adds them.
    #
    # @param client [#list_records] OAI-PMH client
    # @return [Boolean] whether any event or material was extracted
    def read_oai_rdf(client)
      provider_events = []
      provider_materials = []
      totals = Hash.new(0)

      client.list_records(metadata_prefix: 'rdf').full.each do |record|
        metadata_tag = Nokogiri::XML(record.metadata.to_s)
        bioschemas_xml = metadata_tag.at_xpath('metadata/rdf:RDF', RDF_NAMESPACES)&.to_s
        output = parse_bioschemas(bioschemas_xml)

        provider_events.concat(output[:resources][:events])
        provider_materials.concat(output[:resources][:materials])
        output[:totals].each { |type, count| totals[type] += count }
      end

      if totals.any?
        summary_lines = totals.map { |type, count| "\n - #{type}: #{count}" }
        @messages << "Bioschemas summary:\n#{summary_lines.join}"
      end

      @bioschemas_manager.deduplicate(provider_events).each do |event_params|
        add_event(event_params)
      end

      @bioschemas_manager.deduplicate(provider_materials).each do |material_params|
        add_material(material_params)
      end

      provider_events.any? || provider_materials.any?
    end

    # Extracts Bioschemas events and materials from one RDF/XML string.
    # Errors are logged and reported through @messages rather than raised.
    #
    # @param content [String, nil] RDF/XML document, or nil when the record has none
    # @return [Hash] `{ resources: { events: [], materials: [] }, totals: Hash }`
    def parse_bioschemas(content)
      output = {
        resources: {
          events: [],
          materials: []
        },
        totals: Hash.new(0)
      }

      return output unless content

      begin
        events = extract_bioschemas(Tess::Rdf::EventExtractor, content)
        courses = extract_bioschemas(Tess::Rdf::CourseExtractor, content)
        course_instances = extract_bioschemas(Tess::Rdf::CourseInstanceExtractor, content)
        learning_resources = extract_bioschemas(Tess::Rdf::LearningResourceExtractor, content)

        output[:totals]['Events'] += events.count
        output[:totals]['Courses'] += courses.count
        output[:totals]['CourseInstances'] += course_instances.count
        output[:totals]['LearningResources'] += learning_resources.count

        # Events, courses and course instances are all ingested as events.
        @bioschemas_manager.deduplicate(events + courses + course_instances).each do |event|
          output[:resources][:events] << event
        end

        @bioschemas_manager.deduplicate(learning_resources).each do |material|
          output[:resources][:materials] << material
        end
      rescue StandardError => e
        Rails.logger.error("#{e.class}: #{e.message}")
        Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
        @messages << bioschemas_error_message(e)
      end

      output
    end

    private

    # Text of every dc:<element> node in the document.
    def dublin_core_texts(xml_doc, element)
      xml_doc.xpath("//dc:#{element}", ns).map(&:text)
    end

    # Text of the first dc:<element> node, or nil when there is none.
    def first_dublin_core_text(xml_doc, element)
      xml_doc.at_xpath("//dc:#{element}", ns)&.text
    end

    # All dc:date values that Date.parse accepts, in document order.
    def dublin_core_dates(xml_doc)
      parsed = dublin_core_texts(xml_doc, 'date').map do |text|
        Date.parse(text)
      rescue StandardError
        nil
      end
      parsed.compact
    end

    # Runs one Tess::Rdf extractor over RDF/XML content, converting each result's params.
    def extract_bioschemas(extractor_class, content)
      extractor_class.new(content, :rdfxml).extract do |params|
        @bioschemas_manager.convert_params(params)
      end
    end

    # User-facing message for an error raised while extracting Bioschemas.
    def bioschemas_error_message(error)
      return 'An error occurred while reading the source.' unless error.is_a?(RDF::ReaderError)

      'A parsing error occurred while reading the source. Please check your page contains valid RDF/XML.'
    end
  end
end

# --- test/unit/ingestors/oai_pmh_test.rb (new file) ---
# NOTE(review): the heredoc fixtures below are reproduced as visible in this
# view, where their XML markup appears to have been stripped - restore the
# original markup before relying on these tests.
require 'test_helper'

# Stand-in for OAI::Client that serves canned metadata strings: one listing for
# the `rdf` metadata prefix and one for every other request.
class FakeClient
  def initialize(rdf_strings, dc_strings)
    @rdf_response = mock_listing(rdf_strings)
    @dc_response = mock_listing(dc_strings)
  end

  def list_records(metadata_prefix: nil)
    if metadata_prefix == 'rdf'
      @rdf_response
    else
      @dc_response
    end
  end

  private

  # Builds a listing mock whose single `full` call returns one record mock per
  # string; each record answers `metadata` once and that answers `to_s` once.
  def mock_listing(strings)
    records = strings.map do |s|
      record_mock = Minitest::Mock.new
      metadata_mock = Minitest::Mock.new
      record_mock.expect(:metadata, metadata_mock, [])
      metadata_mock.expect(:to_s, s, [])
      record_mock
    end
    listing = Minitest::Mock.new
    listing.expect(:full, records, [])
    listing
  end
end

class OaiPmhTest < ActiveSupport::TestCase
  setup do
    @ingestor = Ingestors::OaiPmhIngestor.new
    @user = users(:regular_user)
    @content_provider = content_providers(:another_portal_provider)
  end

  test 'should read empty oai pmh endpoint' do
    OAI::Client.stub(:new, FakeClient.new([], [])) do
      @ingestor.read('https://example.org')
    end
    assert_equal [], @ingestor.materials
    assert_equal [], @ingestor.events
  end

  test 'should read dublin core material' do
    record = <<~METADATA


      dc_title
      dc_description <b>bold_text</b>
      A, Alice
      B, Bob

      public access
      https://opensource.org/licenses/MIT
      2023-06-26
      2026-06-26
      https://rodare.hzdr.de/record/2513
      10.14278/rodare.2269
      kA
      kB
      kC


    METADATA

    OAI::Client.stub(:new, FakeClient.new([], [record])) do
      @ingestor.read('https://example.org')
    end
    result = @ingestor.materials.first

    assert_equal 'dc_title', result.title
    assert_equal 'dc\\_description **bold\\_text**', result.description
    assert_equal ['A, Alice', 'B, Bob'], result.authors
    assert_equal 'https://opensource.org/licenses/MIT', result.licence
    assert_equal Date.parse('2023-06-26'), result.date_created
    assert_equal Date.parse('2026-06-26'), result.date_modified
    assert_equal 'https://doi.org/10.14278/rodare.2269', result.doi
    assert_equal 'https://rodare.hzdr.de/record/2513', result.url
    assert_equal %w[kA kB kC], result.keywords
  end

  test 'should read dublin core event' do
    record = <<~METADATA


      http://purl.org/dc/dcmitype/Event
      dc_title
      dc_description <b>bold_text</b>
      https://example.org/dc_url
      A, Alice
      B, Bob
      kA
      kB
      kC
      2026-01-01
      2026-01-02


    METADATA

    OAI::Client.stub(:new, FakeClient.new([], [record])) do
      @ingestor.read('https://example.org')
    end
    result = @ingestor.events.first

    assert_equal 'dc_title', result.title
    assert_equal 'dc\\_description **bold\\_text**', result.description
    assert_equal 'https://example.org/dc_url', result.url
    assert_equal 'A, Alice', result.organizer
    assert_equal %w[kA kB kC], result.keywords
    assert_equal Date.parse('2026-01-01'), result.start
    assert_equal Date.parse('2026-01-02'), result.end
  end

  test 'should read multiple dublin core events and materials' do
    event1 = <<~METADATA


      http://purl.org/dc/dcmitype/Event
      title1


    METADATA

    event2 = <<~METADATA


      http://purl.org/dc/dcmitype/Event
      title2


    METADATA

    material1 = <<~METADATA


      title3


    METADATA

    material2 = <<~METADATA


      title4


    METADATA

    OAI::Client.stub(:new, FakeClient.new([], [material1, material2, event1, event2])) do
      @ingestor.read('https://example.org')
    end

    assert_equal %w[title1 title2], @ingestor.events.map(&:title)
    assert_equal %w[title3 title4], @ingestor.materials.map(&:title)
  end

  test 'should read bioschemas' do
    material = <<~METADATA






      bioschemas title




    METADATA

    event = <<~METADATA


      bioschemas title2



    METADATA

    OAI::Client.stub(:new, FakeClient.new([material, material, event], [])) do
      @ingestor.read('https://example.org')
    end

    assert_equal 1, @ingestor.materials.length
    result = @ingestor.materials.first
    assert_equal 'bioschemas title', result.title
    assert_equal 'https://example.org/bioschemas/material', result.url
    assert_equal 'https://opensource.org/licenses/MIT', result.licence

    assert_equal 1, @ingestor.events.length
    result = @ingestor.events.first
    assert_equal 'bioschemas title2', result.title
    assert_equal 'https://example.org/bioschemas/event', result.url
  end
end