'use strict';
import {properties} from "../../explore/src/environments/environment";
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";

const request = require('superagent');

/**
 * Tells whether a refine value must be skipped: it is missing, has no name or
 * id, or its name marks a placeholder bucket ("unknown", "not available",
 * "unidentified", "Undetermined").
 */
function isIgnoredRefineValue(value: any): boolean {
  if (!value || !value.name || !value.id) {
    return true;
  }
  const lowerName = value.name.toLowerCase();
  return lowerName.includes('unknown') || lowerName.includes('not available')
    || value.name === "unidentified" || value.name === "Undetermined";
}

/**
 * Requests one results URL and hands the response to parseAllUrls.
 *
 * The returned promise always resolves: failures are written to the console
 * (and, for a missing response, to the error file) so that a single bad
 * request cannot abort the whole run.
 */
function fetchResultsInto(url: string, allUrls: Set<unknown>): Promise<void> {
  return new Promise<void>((resolve) => {
    request.get(url, function (err: any, response: any) {
      try {
        if (!response) {
          console.error("Error getting results ", err);
          // appendFileSync needs a string/Buffer, so the error is stringified explicitly.
          fs.appendFileSync("./" + errorFileName, "no response " + url + String(err) + "\n");
        } else {
          parseAllUrls(response, allUrls);
        }
      } catch (handlingError) {
        console.error("Error handling results of " + url + " ", handlingError);
      }
      resolve();
    });
  });
}

/**
 * Fetches the refine filters from refineUrl and, for every accepted value of
 * every refine key, requests the first `resultsPerUrl` results filtered by
 * that value and passes each response to parseAllUrls. Once all requests have
 * settled it logs the duplicate/unique counters and appends the closing chunk
 * to the sitemap file.
 *
 * Values of the "community" key are only used when the id prefix (the part
 * before "||") is listed in publicCommunities.
 *
 * @param resultsPerUrl page size sent as the `size` query parameter
 */
function get(resultsPerUrl: number) {
  setTimeout(() => {
    const fieldIdsMap = new SearchFields().RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse) {
        console.error("Error getting refine filters ", err);
        return;
      }

      // A response without refine results is treated as "no keys" instead of crashing.
      const refineResults = (refineResponse.body && refineResponse.body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      const allUrls = new Set<unknown>();
      const pendingRequests: Promise<void>[] = [];

      for (const key of keys) {
        if (key === "community") {
          try {
            await communitiesPromise;
          } catch (communitiesError) {
            // publicCommunities keeps its current content, so communities not
            // listed there are reported as hidden below.
            console.error("Error waiting for communities ", communitiesError);
          }
        }

        const values = refineResults[key] || [];
        console.log("key: " + key + ", number of values: " + values.length);

        const field = fieldIdsMap[key];
        if (!field) {
          console.error("No search field definition for key: " + key);
          console.log("");
          continue;
        }

        for (const value of values) {
          if (isIgnoredRefineValue(value)) {
            console.log("filtered out: " + (value ? ("name: " + value.name + " - id: " + value.id) : value));
            continue;
          }

          if (key === "community") {
            const valueId = value.id.split("||")[0];
            if (!valueId || !publicCommunities.includes(valueId)) {
              console.log("hidden community: " + valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;
          pendingRequests.push(fetchResultsInto(url, allUrls));
        }
        console.log("");
      }

      await Promise.all(pendingRequests);
      console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin);

      // NOTE(review): only a newline is appended as the closing chunk — confirm
      // whether a closing sitemap element is expected here.
      fs.appendFile("./" + fileName, "\n", function (appendError) {
        if (appendError) {
          return console.log("Error appending in file " + fileName + ": ", appendError);
        }
        console.timeEnd("total_time");
      });
    });
  });
}
// });

function parseAllUrls(response: any, allUrls: any) {
  // let allUrls: any = [];
  let responses: any = response.body['results'];
  let searchResearchResultsService: any = new SearchResearchResultsService();

  // if(responses) {
  //   let length = Array.isArray(responses) ? responses.length : 1;
  //   for (let i = 0; i < length; i++) {
  //     let resData = Array.isArray(responses) ? 
responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result']; // // if (resData['pid']) { // if (!Array.isArray(resData['pid'])) { // if (resData['pid'].classid && resData['pid'].classid == 'doi') { // if (resData['pid'].content != '' && resData['pid'].content != null) { // console.log("|"+resData['pid'].content+"| "+(typeof resData['pid'].content)); // resData['pid'].content.replace("https://doi.org/", ""); // } // } // } // } // } // } let searchResults: any = searchResearchResultsService.parseResults("result", responses, properties); if(searchResults.length < 100 && searchResults.length > 0) { console.log("num of results: "+searchResults.length + " " + response.request.url); } if(searchResults.length == 0) { fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"/n"); } for(let j=0; j"; let urlSuf = "\n" + " "; fs.appendFileSync("./"+fileName, urlPre + url + urlSuf); notin++; } } return allUrls; } // function parseAllUrls1(response) { // let allUrls = []; // // let responses = response.body['results']; // let length = Array.isArray(responses) ? responses.length : 1; // // for (let i = 0; i < length; i++) { // let p = new parsingFunctions.ParsingFunctions(); // let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result']; // // let type = "result"; // if (resData['resulttype']) { // type = resData['resulttype']['classname']; // } // // if (resData['pid']) { // let identifiers = p.parseIdentifiers(resData['pid']); // let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers); // // if(pid && pid.id) { // allUrls[i] = getUrlByType(type, pid, pid.id); // } else { // let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? 
// responses[i] : responses, "result");
//       allUrls[i] = getUrlByType(type, null, canId);
//     }
//   } else {
//     let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
//     allUrls[i] = getUrlByType(type, null, canId);
//   }
// }
// return allUrls;
// }
//

/**
 * Builds the landing-page URL `landingPrefix + type + "?" + parameter + "=" + id`.
 *
 * The query parameter name depends on the type ("articleId", "datasetId",
 * "softwareId", "orpId", otherwise "id"); a truthy `pid` overrides it with "pid".
 * `id` is concatenated as given, without any encoding.
 *
 * @param type result type, used both as path segment and to pick the parameter
 * @param pid  any truthy value switches the parameter name to "pid"
 * @param id   value placed after "="
 * @returns the assembled URL string
 */
function getUrlByType(type: any, pid: any, id: any) {
  let parameter: string;

  switch (type) {
    case "publication":
      parameter = "articleId";
      break;
    case "dataset":
      parameter = "datasetId";
      break;
    case "software":
      parameter = "softwareId";
      break;
    case "other":
      parameter = "orpId";
      break;
    default:
      parameter = "id";
  }

  if (pid) {
    parameter = "pid";
  }

  return landingPrefix + type + "?" + parameter + "=" + id;
}

/**
 * Starts loading the communities from contextUrl and stores the in-flight
 * request in `communitiesPromise`. On success `publicCommunities` is replaced
 * with the ids returned by ContextsService.parseCommunities; on failure the
 * promise rejects with the underlying error.
 */
function getCommunities() {
  communitiesPromise = new Promise<void>((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse) {
        console.error("Error getting communities ", err);
        reject(err || new Error("No response from " + contextUrl));
        return;
      }
      try {
        const contextsService = new ContextsService();
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        resolve();
      } catch (parseError) {
        // Without this the promise would stay pending forever if parsing throws.
        console.error("Error getting communities ", parseError);
        reject(parseError);
      }
    });
  });
  // The failure is already logged above; this handler only keeps a rejection
  // that nobody has awaited yet from being reported as unhandled. Anyone who
  // awaits communitiesPromise still observes the rejection.
  communitiesPromise.catch(() => undefined);
}

/**
 * Entry point: derives the sitemap and error file names from today's date,
 * writes the initial sitemap content, then starts the communities request and
 * the crawl of the refine filters.
 *
 * @param resultsPerUrl page size forwarded to get()
 */
function buildSiteMap(resultsPerUrl: number) {
  console.time("total_time");
  const date = new Date();
  const dateSuffix = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate();
  fileName = "sitemap_" + dateSuffix + ".xml";//+"_"+date.getTime();
  errorFileName = "error_" + dateSuffix + ".txt";//+"_"+date.getTime();
  console.log("Buiding sitemap in file: "+fileName+"\n");

  // NOTE(review): the initial content is just a newline — confirm whether an
  // XML declaration / root element is expected here.
  const sitemap = "\n" + "";
  try {
    // Written synchronously so the file exists (and is truncated) before any
    // later append can run.
    fs.writeFileSync("./" + fileName, sitemap);
  } catch (err) {
    console.log("Error writing in file "+fileName+": ", err);
  }

  getCommunities();
  get(resultsPerUrl);
}

// Output file names, set by buildSiteMap().
let fileName: string;
let errorFileName: string;
const fs = require('fs');
let alreadyin = 0; // duplicate urls
let notin = 0;
// In-flight communities request, set by getCommunities().
let communitiesPromise: Promise<void>;
let publicCommunities = [];
const refineUrl = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
const resultsUrlPrefix = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://beta.explore.openaire.eu/search/";
const contextUrl = "https://beta.services.openaire.eu/openaire/contexts/";

buildSiteMap(100);