diff --git a/app/views/layouts/application.html.erb b/app/views/layouts/application.html.erb
index cba95b0c3..7367b9154 100644
--- a/app/views/layouts/application.html.erb
+++ b/app/views/layouts/application.html.erb
@@ -3,7 +3,7 @@
<%= render 'layouts/head' %>
-
<%= 'header-notice-present' if TeSS::Config.header_notice&.strip.present? %>>
+
<%= render partial: 'layouts/header' %>
diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb
index 11d5a2151..b62900ec1 100644
--- a/lib/ingestors/ingestor_factory.rb
+++ b/lib/ingestors/ingestor_factory.rb
@@ -11,6 +11,7 @@ def self.ingestors
Ingestors::MaterialCsvIngestor,
Ingestors::TessEventIngestor,
Ingestors::ZenodoIngestor,
+ Ingestors::OaiPmhIngestor,
Ingestors::GithubIngestor,
] + taxila_ingestors + llm_ingestors
end
diff --git a/lib/ingestors/oai_pmh_ingestor.rb b/lib/ingestors/oai_pmh_ingestor.rb
new file mode 100644
index 000000000..20e9998fc
--- /dev/null
+++ b/lib/ingestors/oai_pmh_ingestor.rb
@@ -0,0 +1,208 @@
+require 'open-uri'
+require 'tess_rdf_extractors'
+
+module Ingestors
+ class OaiPmhIngestor < Ingestor
+ def self.config
+ {
+ key: 'oai_pmh',
+ title: 'OAI-PMH',
+ user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0',
+ mail: Rails.configuration.tess['contact_email']
+ }
+ end
+
+ # Runs the parent initializer and then creates the BioschemasIngestor instance
+ # that the RDF reading path delegates to.
+ def initialize
+ super
+
+ # BioschemasIngestor's instance-level helpers (convert_params, deduplicate) are reused
+ # when reading RDF records, so an instance is kept around for that purpose.
+ @bioschemas_manager = BioschemasIngestor.new
+ end
+
+ def read(source_url)
+ client = OAI::Client.new source_url, headers: { 'From' => config[:mail] }
+ found_bioschemas = begin
+ read_oai_rdf(client)
+ rescue OAI::ArgumentException
+ false
+ end
+
+ read_oai_dublin_core(client) unless found_bioschemas
+ end
+
+ def ns
+ {
+ 'dc' => 'http://purl.org/dc/elements/1.1/',
+ 'oai_dc' => 'http://www.openarchives.org/OAI/2.0/oai_dc/'
+ }
+ end
+
+ def read_oai_dublin_core(client)
+ count = 0
+ client.list_records.full.each do |record|
+ xml_string = record.metadata.to_s
+ doc = Nokogiri::XML(xml_string)
+
+ types = doc.xpath('//dc:type', ns).map(&:text)
+ # this event detection heuristic captures in particular
+ # - http://purl.org/dc/dcmitype/Event (the standard way of typing an event in dublin core)
+ # - https://schema.org/Event
+ if types.any? { |t| t.downcase.include? 'event' }
+ read_dublin_core_event(doc)
+ else
+ read_dublin_core_material(doc)
+ end
+
+ count += 1
+ end
+ @messages << "found #{count} records"
+ end
+
+ def read_dublin_core_material(xml_doc)
+ material = OpenStruct.new
+ material.title = xml_doc.at_xpath('//dc:title', ns)&.text
+ material.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text)
+ material.authors = xml_doc.xpath('//dc:creator', ns).map(&:text)
+ material.contributors = xml_doc.xpath('//dc:contributor', ns).map(&:text)
+
+ rights = xml_doc.xpath('//dc:rights', ns).map { |n| n.text&.strip }.reject(&:empty?)
+ material.licence = rights.find { |r| r.start_with?('http://', 'https://') } || rights.first || 'notspecified'
+
+ dates = xml_doc.xpath('//dc:date', ns).map(&:text)
+ parsed_dates = dates.map do |d|
+ Date.parse(d)
+ rescue StandardError
+ nil
+ end.compact
+ material.date_created = parsed_dates.first
+ material.date_modified = parsed_dates.last if parsed_dates.size > 1
+
+ identifiers = xml_doc.xpath('//dc:identifier', ns).map(&:text)
+ doi = identifiers.find { |id| id.start_with?('10.') || id.start_with?('https://doi.org/') || id.start_with?('http://doi.org/') }
+ if doi
+ doi = doi&.sub(%r{https?://doi\.org/}, '')
+ material.doi = "https://doi.org/#{doi}"
+ end
+ material.url = identifiers.find { |id| id.start_with?('http://', 'https://') }
+
+ material.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text)
+ material.resource_type = xml_doc.xpath('//dc:type', ns).map(&:text)
+ material.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text
+
+ add_material material
+ end
+
+ def read_dublin_core_event(xml_doc)
+ event = OpenStruct.new
+
+ event.title = xml_doc.at_xpath('//dc:title', ns)&.text
+ event.description = convert_description(xml_doc.at_xpath('//dc:description', ns)&.text)
+ event.url = xml_doc.xpath('//dc:identifier', ns).map(&:text).find { |id| id.start_with?('http://', 'https://') }
+ event.contact = xml_doc.at_xpath('//dc:publisher', ns)&.text
+ event.organizer = xml_doc.at_xpath('//dc:creator', ns)&.text
+ event.keywords = xml_doc.xpath('//dc:subject', ns).map(&:text)
+ event.event_types = xml_doc.xpath('//dc:type', ns).map(&:text)
+
+ dates = xml_doc.xpath('//dc:date', ns).map(&:text)
+ parsed_dates = dates.map do |d|
+ Date.parse(d)
+ rescue StandardError
+ nil
+ end.compact
+ event.start = parsed_dates.first
+ event.end = parsed_dates.last
+
+ add_event event
+ end
+
+ def read_oai_rdf(client)
+ provider_events = []
+ provider_materials = []
+ totals = Hash.new(0)
+
+ client.list_records(metadata_prefix: 'rdf').full.each do |record|
+ metadata_tag = Nokogiri::XML(record.metadata.to_s)
+ bioschemas_xml = metadata_tag.at_xpath('metadata/rdf:RDF', 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')&.to_s
+ output = parse_bioschemas(bioschemas_xml)
+ next unless output
+
+ provider_events += output[:resources][:events]
+ provider_materials += output[:resources][:materials]
+ output[:totals].each do |key, value|
+ totals[key] += value
+ end
+ end
+
+ if totals.keys.any?
+ bioschemas_summary = "Bioschemas summary:\n"
+ totals.each do |type, count|
+ bioschemas_summary << "\n - #{type}: #{count}"
+ end
+ @messages << bioschemas_summary
+ end
+
+ @bioschemas_manager.deduplicate(provider_events).each do |event_params|
+ add_event(event_params)
+ end
+
+ @bioschemas_manager.deduplicate(provider_materials).each do |material_params|
+ add_material(material_params)
+ end
+
+ provider_events.any? || provider_materials.any?
+ end
+
+ def parse_bioschemas(content)
+ output = {
+ resources: {
+ events: [],
+ materials: []
+ },
+ totals: Hash.new(0)
+ }
+
+ return output unless content
+
+ begin
+ events = Tess::Rdf::EventExtractor.new(content, :rdfxml).extract do |p|
+ @bioschemas_manager.convert_params(p)
+ end
+ courses = Tess::Rdf::CourseExtractor.new(content, :rdfxml).extract do |p|
+ @bioschemas_manager.convert_params(p)
+ end
+ course_instances = Tess::Rdf::CourseInstanceExtractor.new(content, :rdfxml).extract do |p|
+ @bioschemas_manager.convert_params(p)
+ end
+ learning_resources = Tess::Rdf::LearningResourceExtractor.new(content, :rdfxml).extract do |p|
+ @bioschemas_manager.convert_params(p)
+ end
+ output[:totals]['Events'] += events.count
+ output[:totals]['Courses'] += courses.count
+ output[:totals]['CourseInstances'] += course_instances.count
+ output[:totals]['LearningResources'] += learning_resources.count
+
+ @bioschemas_manager.deduplicate(events + courses + course_instances).each do |event|
+ output[:resources][:events] << event
+ end
+
+ @bioschemas_manager.deduplicate(learning_resources).each do |material|
+ output[:resources][:materials] << material
+ end
+ rescue StandardError => e
+ Rails.logger.error("#{e.class}: #{e.message}")
+ Rails.logger.error(e.backtrace.join("\n")) if e.backtrace&.any?
+ error = 'An error'
+ comment = nil
+ if e.is_a?(RDF::ReaderError)
+ error = 'A parsing error'
+ comment = 'Please check your page contains valid RDF/XML.'
+ end
+ message = "#{error} occurred while reading the source."
+ message << " #{comment}" if comment
+ @messages << message
+ end
+
+ output
+ end
+ end
+end
diff --git a/test/unit/ingestors/oai_pmh_test.rb b/test/unit/ingestors/oai_pmh_test.rb
new file mode 100644
index 000000000..45c22bea8
--- /dev/null
+++ b/test/unit/ingestors/oai_pmh_test.rb
@@ -0,0 +1,202 @@
+require 'test_helper'
+
+class FakeClient
+ def initialize(rdf_strings, dc_strings)
+ @rdf_response = Minitest::Mock.new
+ rdf_response = rdf_strings.map do |s|
+ inner_mock = Minitest::Mock.new
+ outer_mock = Minitest::Mock.new
+ inner_mock.expect(:metadata, outer_mock, [])
+ outer_mock.expect(:to_s, s, [])
+ inner_mock
+ end
+ dc_response = dc_strings.map do |s|
+ inner_mock = Minitest::Mock.new
+ outer_mock = Minitest::Mock.new
+ inner_mock.expect(:metadata, outer_mock, [])
+ outer_mock.expect(:to_s, s, [])
+ inner_mock
+ end
+ @rdf_response.expect(:full, rdf_response, [])
+ @dc_response = Minitest::Mock.new
+ @dc_response.expect(:full, dc_response, [])
+ end
+
+ def list_records(metadata_prefix: nil)
+ if metadata_prefix == 'rdf'
+ @rdf_response
+ else
+ @dc_response
+ end
+ end
+end
+
+class OaiPmhTest < ActiveSupport::TestCase
+ # Creates a fresh OAI-PMH ingestor and looks up the :regular_user user and the
+ # :another_portal_provider content provider for use by the tests.
+ setup do
+ @ingestor = Ingestors::OaiPmhIngestor.new
+ @user = users(:regular_user)
+ @content_provider = content_providers(:another_portal_provider)
+ end
+
+ test 'should read empty oai pmh endpoint' do
+ OAI::Client.stub(:new, FakeClient.new([], [])) do
+ @ingestor.read('https://example.org')
+ end
+ assert_equal @ingestor.materials, []
+ assert_equal @ingestor.events, []
+ end
+
+ # Supplies a single record through the Dublin Core listing (the RDF listing is empty)
+ # and checks the fields mapped onto the resulting material.
+ test 'should read dublin core material' do
+ record = <<~METADATA
+
+
+ dc_title
+ dc_description <b>bold_text</b>
+ A, Alice
+ B, Bob
+
+ public access
+ https://opensource.org/licenses/MIT
+ 2023-06-26
+ 2026-06-26
+ https://rodare.hzdr.de/record/2513
+ 10.14278/rodare.2269
+ kA
+ kB
+ kC
+
+
+ METADATA
+
+ OAI::Client.stub(:new, FakeClient.new([], [record])) do
+ @ingestor.read('https://example.org')
+ end
+ result = @ingestor.materials.first
+
+ # The licence is expected to be the URL-valued rights entry, and the bare DOI
+ # identifier is expected to be expanded to a https://doi.org/ URL.
+ assert_equal 'dc_title', result.title
+ assert_equal 'dc\\_description **bold\\_text**', result.description
+ assert_equal ['A, Alice', 'B, Bob'], result.authors
+ assert_equal 'https://opensource.org/licenses/MIT', result.licence
+ assert_equal Date.parse('2023-06-26'), result.date_created
+ assert_equal Date.parse('2026-06-26'), result.date_modified
+ assert_equal 'https://doi.org/10.14278/rodare.2269', result.doi
+ assert_equal 'https://rodare.hzdr.de/record/2513', result.url
+ assert_equal %w[kA kB kC], result.keywords
+ end
+
+ # Supplies a single record typed as http://purl.org/dc/dcmitype/Event through the
+ # Dublin Core listing and checks the fields mapped onto the resulting event.
+ test 'should read dublin core event' do
+ record = <<~METADATA
+
+
+ http://purl.org/dc/dcmitype/Event
+ dc_title
+ dc_description <b>bold_text</b>
+ https://example.org/dc_url
+ A, Alice
+ B, Bob
+ kA
+ kB
+ kC
+ 2026-01-01
+ 2026-01-02
+
+
+ METADATA
+
+ OAI::Client.stub(:new, FakeClient.new([], [record])) do
+ @ingestor.read('https://example.org')
+ end
+ result = @ingestor.events.first
+
+ # Only the first creator is expected as organizer; the two dates are expected as start and end.
+ assert_equal 'dc_title', result.title
+ assert_equal 'dc\\_description **bold\\_text**', result.description
+ assert_equal 'https://example.org/dc_url', result.url
+ assert_equal 'A, Alice', result.organizer
+ assert_equal %w[kA kB kC], result.keywords
+ assert_equal Date.parse('2026-01-01'), result.start
+ assert_equal Date.parse('2026-01-02'), result.end
+ end
+
+ # Supplies two materials followed by two events through the Dublin Core listing and
+ # checks that each record ends up in the matching collection, in listing order.
+ test 'should read multiple dublin core events and materials' do
+ event1 = <<~METADATA
+
+
+ http://purl.org/dc/dcmitype/Event
+ title1
+
+
+ METADATA
+
+ event2 = <<~METADATA
+
+
+ http://purl.org/dc/dcmitype/Event
+ title2
+
+
+ METADATA
+
+ material1 = <<~METADATA
+
+
+ title3
+
+
+ METADATA
+
+ material2 = <<~METADATA
+
+
+ title4
+
+
+ METADATA
+
+ OAI::Client.stub(:new, FakeClient.new([], [material1, material2, event1, event2])) do
+ @ingestor.read('https://example.org')
+ end
+
+ assert_equal %w[title1 title2], @ingestor.events.map(&:title)
+ assert_equal %w[title3 title4], @ingestor.materials.map(&:title)
+ end
+
+ # Supplies records through the RDF listing only (the Dublin Core listing is empty).
+ # The material record is passed twice and is expected to be ingested once.
+ test 'should read bioschemas' do
+ material = <<~METADATA
+
+
+
+
+
+
+ bioschemas title
+
+
+
+
+ METADATA
+
+ event = <<~METADATA
+
+
+ bioschemas title2
+
+
+
+ METADATA
+
+ OAI::Client.stub(:new, FakeClient.new([material, material, event], [])) do
+ @ingestor.read('https://example.org')
+ end
+
+ assert_equal 1, @ingestor.materials.length
+ result = @ingestor.materials.first
+ assert_equal 'bioschemas title', result.title
+ assert_equal 'https://example.org/bioschemas/material', result.url
+ assert_equal 'https://opensource.org/licenses/MIT', result.licence
+
+ assert_equal 1, @ingestor.events.length
+ result = @ingestor.events.first
+ assert_equal 'bioschemas title2', result.title
+ assert_equal 'https://example.org/bioschemas/event', result.url
+ end
+end