package eu.dnetlib.ariadneplus; import eu.dnetlib.ariadneplus.elasticsearch.BulkUpload; import eu.dnetlib.ariadneplus.reader.ResourceManager; import eu.dnetlib.ariadneplus.reader.RunSPARQLQueryService; import eu.dnetlib.ariadneplus.reader.json.ParseRDFJSON; import eu.dnetlib.ariadneplus.reader.utils.ESUtils; import org.apache.http.HttpHost; import org.elasticsearch.action.get.GetRequest; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestClient; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.search.fetch.subphase.FetchSourceContext; import org.junit.After; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; import org.springframework.core.io.ClassPathResource; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.List; import java.util.Properties; @Ignore public class FindMissingRecordsInIndexTest { private RunSPARQLQueryService runSPQRLQuery; private final static String STAGING_PROPERTIES = "application.properties"; private final static String PROD_PROPERTIES = "application-prod-DO-NOT-COMMIT.properties"; private RestHighLevelClient client; private Properties appProps; @Before public void setUp() throws IOException { final ClassPathResource resource = new ClassPathResource(STAGING_PROPERTIES); appProps = new Properties(); appProps.load(resource.getInputStream()); runSPQRLQuery = new RunSPARQLQueryService(); runSPQRLQuery.setupReadOnlyConnection( appProps.getProperty("graphdb.serverUrl"), appProps.getProperty("graphdb.repository")); client = new RestHighLevelClient( RestClient.builder( new HttpHost(appProps.getProperty("elasticsearch.hostname"), 9200, "http"))); } @After public void tearDown() throws IOException { client.close(); } @Test public void findMissingRecordsTest() throws NoSuchAlgorithmException, IOException { String datasource = "ads"; String collectionId = "archives"; List uris = runSPQRLQuery.selectRecordIds(datasource, collectionId); System.out.println("Got list of ids, they are "+uris.size()); for(String uri : uris){ //compute the md5 and query the index to check if there is a record with that uri. If not, print it MessageDigest digest = MessageDigest.getInstance("SHA-256"); byte[] encodedhash = digest.digest( uri.getBytes(StandardCharsets.UTF_8)); String idES = ESUtils.bytesToHex(encodedhash); // High level API GetRequest getRequest = new GetRequest( appProps.getProperty("elasticsearch.indexname"), idES); getRequest.fetchSourceContext(new FetchSourceContext(false)); getRequest.storedFields("_none_"); boolean exists = client.exists(getRequest, RequestOptions.DEFAULT); if(!exists) System.out.println(uri); } System.out.println("Done"); } }