added a base vocabulary

This commit is contained in:
Michele Artini 2024-03-08 11:52:58 +01:00
parent 3b5163d8e3
commit 932173287a
3 changed files with 205 additions and 12 deletions

View File

@ -45,9 +45,9 @@ public class BaseCollectorPlugin implements CollectorPlugin {
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report) throws CollectorException {
// get path to file
final Path filePath = Optional
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
.ofNullable(api.getBaseUrl())
.map(Path::new)
.orElseThrow(() -> new CollectorException("missing baseUrl"));
final String dbUrl = api.getParams().get("dbUrl");
final String dbUser = api.getParams().get("dbUser");
@ -59,7 +59,9 @@ public class BaseCollectorPlugin implements CollectorPlugin {
log.info("dbPassword: {}", "***");
try {
if (!this.fs.exists(filePath)) { throw new CollectorException("path does not exist: " + filePath); }
if (!this.fs.exists(filePath)) {
throw new CollectorException("path does not exist: " + filePath);
}
} catch (final Throwable e) {
throw new CollectorException(e);
}
@ -69,19 +71,20 @@ public class BaseCollectorPlugin implements CollectorPlugin {
final Iterator<String> iterator = new BaseCollectorIterator(this.fs, filePath, report);
final Spliterator<String> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
return StreamSupport
.stream(spliterator, false)
.filter(doc -> filterXml(doc, acceptedOpendoarIds, report));
.stream(spliterator, false)
.filter(doc -> filterXml(doc, acceptedOpendoarIds, report));
}
private Set<String> findAcceptedOpendoarIds(final String dbUrl, final String dbUser, final String dbPassword)
throws CollectorException {
throws CollectorException {
final Set<String> accepted = new HashSet<>();
try (final DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
final String sql = IOUtils
.toString(BaseAnalyzerJob.class
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
.toString(
BaseAnalyzerJob.class
.getResourceAsStream("/eu/dnetlib/dhp/collection/plugin/base/sql/opendoar-accepted.sql"));
dbClient.processResults(sql, row -> {
try {
@ -106,7 +109,10 @@ public class BaseCollectorPlugin implements CollectorPlugin {
private boolean filterXml(final String xml, final Set<String> acceptedOpendoarIds, final AggregatorReport report) {
try {
final String id = DocumentHelper.parseText(xml).valueOf("//*[local-name()='collection']/@opendoar_id").trim();
final String id = DocumentHelper
.parseText(xml)
.valueOf("//*[local-name()='collection']/@opendoar_id")
.trim();
return (StringUtils.isNotBlank(id) && acceptedOpendoarIds.contains("opendoar____::" + id.trim()));
} catch (final DocumentException e) {
log.error("Error parsing document", e);

View File

@ -0,0 +1,180 @@
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value="" />
<RESOURCE_TYPE value="VocabularyDSResourceType" />
<RESOURCE_KIND value="VocabularyDSResources" />
<RESOURCE_URI value="" />
<DATE_OF_CREATION value="2024-02-13T11:15:48+00:00" />
</HEADER>
<BODY>
<CONFIGURATION>
<VOCABULARY_NAME code="base:normalized_types">base:normalized_types</VOCABULARY_NAME>
<VOCABULARY_DESCRIPTION>base:normalized_types</VOCABULARY_DESCRIPTION>
<TERMS>
<TERM native_name="Text" code="Text" english_name="Text" encoding="BASE">
<SYNONYMS>
<SYNONYM term="1" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Book" code="Book" english_name="Book" encoding="BASE">
<SYNONYMS>
<SYNONYM term="11" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Book part" code="Book part" english_name="Book part" encoding="BASE">
<SYNONYMS>
<SYNONYM term="111" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Journal/Newspaper" code="Journal/Newspaper" english_name="Journal/Newspaper" encoding="BASE">
<SYNONYMS>
<SYNONYM term="12" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Article contribution" code="Article contribution" english_name="Article contribution" encoding="BASE">
<SYNONYMS>
<SYNONYM term="121" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Other non-article" code="Other non-article" english_name="Other non-article" encoding="BASE">
<SYNONYMS>
<SYNONYM term="122" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Conference object" code="Conference object" english_name="Conference object" encoding="BASE">
<SYNONYMS>
<SYNONYM term="13" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Report" code="Report" english_name="Report" encoding="BASE">
<SYNONYMS>
<SYNONYM term="14" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Review" code="Review" english_name="Review" encoding="BASE">
<SYNONYMS>
<SYNONYM term="15" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Course material" code="Course material" english_name="Course material" encoding="BASE">
<SYNONYMS>
<SYNONYM term="16" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Lecture" code="Lecture" english_name="Lecture" encoding="BASE">
<SYNONYMS>
<SYNONYM term="17" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Thesis" code="Thesis" english_name="Thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="18" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Bachelor's thesis" code="Bachelor's thesis" english_name="Bachelor's thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="181" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Master's thesis" code="Master's thesis" english_name="Master's thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="182" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Doctoral and postdoctoral thesis" code="Doctoral and postdoctoral thesis" english_name="Doctoral and postdoctoral thesis" encoding="BASE">
<SYNONYMS>
<SYNONYM term="183" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Manuscript" code="Manuscript" english_name="Manuscript" encoding="BASE">
<SYNONYMS>
<SYNONYM term="19" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Patent" code="Patent" english_name="Patent" encoding="BASE">
<SYNONYMS>
<SYNONYM term="1A" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Musical notation" code="Musical notation" english_name="Musical notation" encoding="BASE">
<SYNONYMS>
<SYNONYM term="2" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Map" code="Map" english_name="Map" encoding="BASE">
<SYNONYMS>
<SYNONYM term="3" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Audio" code="Audio" english_name="Audio" encoding="BASE">
<SYNONYMS>
<SYNONYM term="4" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Image/Video" code="Image/Video" english_name="Image/Video" encoding="BASE">
<SYNONYMS>
<SYNONYM term="5" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Still image" code="Still image" english_name="Still image" encoding="BASE">
<SYNONYMS>
<SYNONYM term="51" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Moving image/Video" code="Moving image/Video" english_name="Moving image/Video" encoding="BASE">
<SYNONYMS>
<SYNONYM term="52" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Software" code="Software" english_name="Software" encoding="BASE">
<SYNONYMS>
<SYNONYM term="6" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Dataset" code="Dataset" english_name="Dataset" encoding="BASE">
<SYNONYMS>
<SYNONYM term="7" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
<TERM native_name="Unknown" code="Unknown" english_name="Unknown" encoding="BASE">
<SYNONYMS>
<SYNONYM term="F" encoding="BASE" />
</SYNONYMS>
<RELATIONS />
</TERM>
</TERMS>
</CONFIGURATION>
<STATUS>
<LAST_UPDATE value="2013-11-18T10:46:36Z" />
</STATUS>
<SECURITY_PARAMETERS>String</SECURITY_PARAMETERS>
</BODY>
</RESOURCE_PROFILE>

View File

@ -58,6 +58,9 @@ base_dc:link (I used dc:identifier)
-->
<xsl:variable name="varBaseNormType" select="vocabulary:clean(//base_dc:typenorm, 'base:normalized_types')" />
<metadata>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:title" />
@ -100,11 +103,15 @@ base_dc:link (I used dc:identifier)
<xsl:with-param name="targetElement" select="'dc:format'" />
</xsl:call-template>
<dc:type>
<xsl:value-of select="$varBaseNormType" />
</dc:type>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:type" />
<xsl:with-param name="targetElement" select="'dc:type'" />
</xsl:call-template>
<xsl:call-template name="allElements">
<xsl:with-param name="sourceElement" select="//dc:source" />
<xsl:with-param name="targetElement" select="'dc:source'" />
@ -202,8 +209,8 @@ base_dc:link (I used dc:identifier)
<!-- TODO: ADD VOCABULARY TERMS -->
<dr:CobjCategory>
<xsl:variable name="varCobjCategory" select="vocabulary:clean( //base_dc:typenorm, 'dnet:publication_resource')" />
<xsl:variable name="varSuperType" select="vocabulary:clean( $varCobjCategory, 'dnet:result_typologies')" />
<xsl:variable name="varCobjCategory" select="vocabulary:clean($varBaseNormType, 'dnet:publication_resource')" />
<xsl:variable name="varSuperType" select="vocabulary:clean($varCobjCategory, 'dnet:result_typologies')" />
<xsl:attribute name="type" select="$varSuperType" />
<xsl:value-of select="$varCobjCategory" />
</dr:CobjCategory>