package eu.dnetlib.dhp.bmuse.bioschema;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Smoke test that feeds a stored HTML page to Apache Any23 and serializes whatever
 * triples it extracts as N-Triples.
 */
public class Html2TriplesTest {

	static final Logger logger = LoggerFactory.getLogger(Html2TriplesTest.class);

	/** Classpath location of the HTML page used as extraction input. */
	private static final String PAGE_RESOURCE = "/eu/dnetlib/dhp/bmuse/bioschema/ped.html";

	/** Document URI handed to Any23 together with the page content. */
	private static final String PAGE_URI = "https://proteinensemble.org/PED00001";

	/**
	 * Loads the HTML fixture, runs the Any23 extraction over it and logs the resulting
	 * N-Triples document.
	 *
	 * @throws Exception if the fixture is missing or cannot be read, or if extraction or
	 *                   serialization fails; failures are propagated (not just logged) so
	 *                   that the test actually fails when the conversion breaks
	 */
	@Test
	void conversionTest() throws Exception {
		final String page;
		// try-with-resources tolerates a null resource, so the explicit check below is
		// what turns a missing fixture into a readable failure.
		try (InputStream is = Html2TriplesTest.class.getResourceAsStream(PAGE_RESOURCE)) {
			if (is == null) {
				throw new IllegalStateException("Test resource not found on classpath: " + PAGE_RESOURCE);
			}
			page = IOUtils.toString(is, StandardCharsets.UTF_8.name());
		}

		DocumentSource source = new StringDocumentSource(page, PAGE_URI);
		Any23 runner = new Any23();

		try (ByteArrayOutputStream out = new ByteArrayOutputStream()) {
			// Close the handler before reading the buffer, in case the writer holds back
			// output until it is closed.
			try (TripleHandler handler = new NTriplesWriter(out)) {
				runner.extract(source, handler);
			}
			logger.info(out.toString("UTF-8"));
		}
	}
}
diff --git a/dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html b/dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html new file mode 100644 index 000000000..e9ca13caa --- /dev/null +++ b/dhp-workflows/dhp-bmuse/src/test/resources/eu/dnetlib/dhp/bmuse/bioschema/ped.html @@ -0,0 +1,37 @@ + + + PED + + + + + + + + + + + + + + + + + + + + +

PED00001 - Structural ensemble of pSic1 (1-90) with phosphorylations at Thr5, Thr33, Thr45, Ser69, Ser76, Ser80

Experiments' raw data bmrb:16659 link
Publication
Structure/function implications in a dynamic complex of the intrinsically disordered Sic1 with the Cdc4 subunit of an SCF ubiquitin ligase. Mittag T, Marsh J, Grishaev A, Orlicky S, Lin H, Sicheri F, Tyers M, Forman-Kay JD. Structure, 2010pubmed:20399186

NMR Experiments: All NMR data were collected on Varian Inova 500 MHz, 600 MHz and 800 MHz spectrometers at 5 °C. The NMR samples were prepared in PBS and spectra were processed and analysed using NMRPipe/NMR Draw. Assignments and relaxation experiments were reported previously. NMR data is deposited in the BMRB with accession codes 16657 and 16659 for Sic1 and pSic1, respectively. PRE experiments and NH R2 NMR experiments were performed. The paramagnetic contribution to the transverse relaxation rate, (i.e. the paramagnetic relaxation enhancement, PRE), is the difference between transverse relaxation rates in paramagnetic and diamagnetic states. 1DHN RDCs were measured on 0.3 mM and 0.2 mM Sic1 and pSic1 samples, respectively. Couplings were extracted using "Fuda: A function and data fitting and analysis package". Errors were calculated from at least duplicate data sets. SAXS data collection: Small angle x-ray scattering data were acquired at the Beam Line 12-IDC at the Advanced Light Source synchrotron (Argonne National Laboratory, Argonne, IL). A total of 20 sequential data frames with exposure times of 0.25 seconds were recorded. Samples and buffers were flowing during data collection to prevent radiation damage. Individual data frames were converted from 2D to 1D profiles and normalized by the corresponding incident beam intensities. The final 1D scattering profiles and their uncertainties were then calculated as means and standard deviations over the 20 frames and then buffer data were subtracted from the sample data.

Ensemble models of intrinsically disordered Sic1 and pSic1 were calculated using essentially the same approach as was described (Marsh and Forman-Kay, 2009). Distance restraints were calculated from PRE measurements. SAXS profiles of the experimentally restrained ensembles were calculated by predicting scattering curves for each individual member using the program CRYSOL (Svergun et al., 1995) and averaged over the members of the ensemble. Chemical shifts were calculated from individual conformers using SHIFTX (Neal et al., 2003). RDCs were calculated using a local alignment approach, in which local alignment tensors are calculated for 15 residue fragments of the sequence in a sliding window fashion (Marsh et al., 2008). 15N R2 relaxation rates were compared to the number of heavy atoms in an 8 Å radius of each measured nucleus, as previously described (Marsh and Forman-Kay, 2009). + The Sic1 and pSic1 ensemble models comprised residues 1-90 of the full-length Sic1 amino acid sequence plus an N-terminal Gly-Ser sequence remaining after tag cleavage. Glutamate residues were used to represent the phosphorylated residues in pSic1 to facilitate use of TraDES (Feldman and Hogue, 2000). These glutamate residues were converted to the proper phosphorylated threonine or serine residues for electrostatic calculations. Three independent ensembles were calculated for each of free Sic1 and pSic1 and the pSic1 complex. Calculations were performed on a cluster of CPUs, with one main node performing the core conformational selection calculations and 8-12 nodes performing the iterative conformational sampling with CNS (Brünger et al., 1998), Unfoldtraj, and TraDES (Feldman and Hogue, 2000). The initial temperature for the ENSEMBLE calculations was set to 10,000 and decreased to 0.01 in 200,000 steps.
The starting ensembles contained 200 structures and the number of conformers comprising the ensembles was decreased by one after each successful ENSEMBLE calculation in which full agreement with experimental restraints was achieved. Calculations were stopped when a smaller ensemble could not be successfully calculated within 72 hr.

Cross reference disprot:DP00631 link
102030405060708090P38634 GSMTPSTPPRSRGTRYLAQPSGNTSSSALMQGQKTPQKPSQNLVPVTPSTTKSFKNAPLLAPPNSNMGMTSPFNGLTSPQRSPFPKSSVKRTChain APTMsPED00001e001 Secondary structure entropyPED00001e001 Relative ASA PED00001e002 Secondary structure entropyPED00001e002 Relative ASA PED00001e003 Secondary structure entropyPED00001e003 Relative ASA

Deposited ensemble

Ensemble ID PED00001e001 Number of models 11

Chain: A
Secondary structure entropy 0.40Relative solvent accessibility 0.68 Radius of gyration 26.74
Parsing... [0/1481071]

Ensemble ID PED00001e002 Number of models 10

Chain: A
Secondary structure entropy 0.39Relative solvent accessibility 0.70 Radius of gyration 26.71
Parsing... [1485862/1485862]

Ensemble ID PED00001e003 Number of models 11

Chain: A
Secondary structure entropy 0.44Relative solvent accessibility 0.70 Radius of gyration 28.15
Model 1 / 10
+ + + \ No newline at end of file