package eu.dnetlib.dhp.bmuse.bioschema; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import org.apache.any23.Any23; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.source.DocumentSource; import org.apache.any23.source.StringDocumentSource; import org.apache.any23.writer.NTriplesWriter; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; import org.apache.commons.io.IOUtils; import org.apache.commons.io.output.ByteArrayOutputStream; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class Html2TriplesTest { static Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class); @Test // @Disabled void conversionTest() throws Exception { InputStream is = Html2TriplesTest.class.getResourceAsStream("/eu/dnetlib/dhp/bmuse/bioschema/ped.html"); String page = IOUtils.toString(is, StandardCharsets.UTF_8.name()); DocumentSource source = new StringDocumentSource(page, "https://proteinensemble.org/PED00001"); Any23 runner = new Any23(); try (ByteArrayOutputStream out = new ByteArrayOutputStream(); TripleHandler handler = new NTriplesWriter(out);) { runner.extract(source, handler); logger.info(out.toString("UTF-8")); } catch (ExtractionException e) { logger.error("Cannot extract triples", e); } catch (IOException e1) { logger.error(" IO error whilst extracting triples", e1); } catch (TripleHandlerException e2) { logger.error("TripleHanderException", e2); } } }