'use strict';

import {properties} from "../../explore/src/environments/environment";
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
import {ResultLandingComponent} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.component";
import {ResultLandingService} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.service";
import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/entities/resultLandingInfo";

const request = require('superagent');

// NOTE(review): the copy of this file that was reviewed had every "<...>" span stripped
// (XML string literals, one `for` header, and the code between parseAllUrls_old's loop
// and the first lines of parseAllUrls). Those spans are reconstructed below and each one
// is marked with "NOTE(review)" - confirm them against version control before merging.

/**
 * Issues a GET request and settles with the raw superagent response.
 *
 * Rejects only when the request produced an error and no response object at all;
 * a response that arrives together with an error is still handed to the caller,
 * which is the condition every request in this script has always used.
 *
 * @param url absolute url to request
 * @returns the superagent response object
 */
function fetchResponse(url: string): Promise<any> {
  return new Promise((resolve, reject) => {
    request.get(url, function (err: any, response: any) {
      if (!response && err) {
        reject(err);
      } else {
        resolve(response);
      }
    });
  });
}

/**
 * Reports a failed results request on the console and in the error file.
 *
 * The error is converted with String() before it is written: fs.appendFileSync
 * only accepts strings and binary data, so handing it the Error object (as the
 * script used to do) throws inside the handler and turns the logged failure into
 * a rejected entry of promiseArray.
 *
 * @param url   url that is recorded next to the error
 * @param error whatever the request or the response parsing raised
 */
function logRequestError(url: string, error: unknown): void {
  console.error("Error getting results ", error);
  fs.appendFileSync("./" + errorFileName, "no response " + url + " " + String(error) + "\n");
}

/**
 * File-name fragment for an optional subject: "_" + subject without whitespace,
 * or the empty string when no subject is given.
 */
function subjectSuffix(subject: string | null): string {
  return subject ? "_" + subject.replace(/\s/g, "") : "";
}

/**
 * Collects the landing urls of all results that have the given subject.
 *
 * Asks the search API for the total number of matches first, then requests the
 * result pages one by one (a new request is started every 500ms) and feeds every
 * response to parseAllUrls. finalize() is called once all requests are started.
 *
 * @param resultsPerUrl page size used for every results request
 * @param resultsType   value of the "type" query parameter
 * @param subject       subject to search for; keys of subjectMapping are translated
 */
function getForSubject(resultsPerUrl: number, resultsType: string, subject: string) {
  setTimeout(async () => {
    const allUrls = new Set<string>();
    const reqSubject: string = subjectMapping[subject] ?? subject;

    const url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType;

    // Stays at 150000 when the count request fails, so the paging below still runs.
    let totalResults: number = 150000;
    try {
      const countResponse = await fetchResponse(url + "&size=0&page=0");
      totalResults = countResponse.body['meta']['total'];
    } catch (error) {
      logRequestError(url, error);
    }

    const pages: number = Math.ceil(totalResults / resultsPerUrl);
    console.log("totalResults="+totalResults + " - pages="+pages);

    // NOTE(review): loop header and throttle reconstructed (see the note at the top of the file).
    for (let page = 0; page < pages; page++) {
      await new Promise(resolve => setTimeout(resolve, 500));
      promiseArray.push(
        fetchResponse(url + "&size=" + resultsPerUrl + "&page=" + page)
          // Parsing runs inside the chain so that a malformed response is logged
          // by the catch below instead of throwing inside the request callback.
          .then(response => {
            parseAllUrls(response, allUrls, resultsType, subject);
          })
          .catch(error => logRequestError(url, error))
      );
    }

    finalize(resultsPerUrl, resultsType);
  });
}

/**
 * Collects landing urls by walking the refine (facet) values of the search API.
 *
 * Requests the refine filters for the given type and, for every usable value of
 * every filter, requests the first page of matching results (a new request is
 * started every 500ms) and feeds the response to parseAllUrls.
 * Values without name/id, "unknown"-like values and communities that are not in
 * publicCommunities are skipped. finalize() is called once all requests are started.
 *
 * @param resultsPerUrl page size used for every results request
 * @param resultsType   value of the "type" query parameter
 */
function get(resultsPerUrl: number, resultsType: string) {
  setTimeout(() => {
    const searchFields = new SearchFields();
    const fieldIdsMap = searchFields.RESULT_FIELDS;

    refineUrl += resultsType;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ",err);
        return;
      }

      const refineResults = refineResponse.body['refineResults'];
      const keys = refineResults ? Object.keys(refineResults) : null;
      console.log("number of keys: " + (keys? keys.length : 'error: no refine results returned'));

      const allUrls = new Set<string>();

      // `keys` is null when no refine results came back; iterating it directly
      // threw a TypeError, now the run just ends with an empty sitemap.
      for (const key of keys ?? []) {
        // comment out for communities query check
        if (key === "community") {
          await communitiesPromise;
        }

        console.log("key: "+key+", number of values: " + refineResults[key].length);

        for (const value of refineResults[key]) {
          if (!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown')
            || value.name.toLowerCase().includes('not available')
            || value.name == "unidentified" || value.name == "Undetermined") {
            console.log("filtered out: "+(value ? ("name: "+value.name + " - id: "+value.id) : value));
            continue;
          }

          if (key === "community") {
            // Only the part of the id before the first "||" is compared.
            let valueId = "";
            if (value.id) {
              const idArray = value.id.split("||");
              if (idArray) {
                valueId = idArray[0];
              }
            }
            // if(!valueId) {
            if (!valueId || !publicCommunities.includes(valueId)) { // comment out for communities query check
              console.log("hidden community: "+valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;

          await new Promise(resolve => setTimeout(resolve, 500));
          promiseArray.push(
            fetchResponse(url)
              .then(response => {
                parseAllUrls(response, allUrls, resultsType);
              })
              .catch(error => logRequestError(url, error))
          );
        }
        console.log("");
      }

      finalize(resultsPerUrl, resultsType);
    })
  })
}

/**
 * Waits for every request in promiseArray, prints the counters, closes the
 * current sitemap file and stops the timer started by buildSiteMap.
 *
 * @param resultsPerUrl only used to rebuild the timer label
 * @param resultsType   only used to rebuild the timer label
 */
async function finalize(resultsPerUrl: number, resultsType: string) {
  await Promise.all(promiseArray);

  console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin);
  console.log("\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls);
  console.log("\nPublications: " + publications + " - Datasets: " + datasets +
    " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid);

  // NOTE(review): closing tag reconstructed - only "\n" survived in the reviewed copy.
  fs.appendFile("./" + fileName, "\n</urlset>", function (err: any) {
    if (err) {
      return console.log("Error appending in file " + fileName + ": ", err);
    }
    console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
  });
}

/**
 * Earlier variant of parseAllUrls that goes through SearchResearchResultsService.
 * Nothing in this file calls it.
 *
 * @param response superagent response of a results request
 * @param allUrls  set of urls seen so far; new urls are added to it
 * @returns the same set that was passed in
 */
function parseAllUrls_old(response: any, allUrls: any) {
  // let allUrls: any = [];

  let responses: any = response.body['results'];
  let searchResearchResultsService: any = new SearchResearchResultsService();

  // if(responses) {
  //   let length = Array.isArray(responses) ? responses.length : 1;
  //   for (let i = 0; i < length; i++) {
  //     let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
  //
  //     if (resData['pid']) {
  //       if (!Array.isArray(resData['pid'])) {
  //         if (resData['pid'].classid && resData['pid'].classid == 'doi') {
  //           if (resData['pid'].content != '' && resData['pid'].content != null) {
  //             console.log("|"+resData['pid'].content+"| "+(typeof resData['pid'].content));
  //             resData['pid'].content.replace("https://doi.org/", "");
  //           }
  //         }
  //       }
  //     }
  //   }
  // }

  let searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);
  if(searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: "+searchResults.length + " " + response.request.url);
  }
  if(searchResults.length == 0) {
    fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n");
  }

  // NOTE(review): everything from this loop header to the end of the function was
  // missing in the reviewed copy. The body below is a reconstruction modelled on
  // parseAllUrls; ResultPreview.searchResultConvert, `entityType` and `relcanId`
  // are assumptions - verify against version control.
  for (let j = 0; j < searchResults.length; j++) {
    let resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
    let pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);
    let url;
    if (pid && pid.id) {
      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
    } else {
      url = getUrlByType(resultPreview.resultType, null, resultPreview.relcanId);
    }
    if (allUrls.has(url)) {
      alreadyin++;
    } else {
      allUrls.add(url);
      notin++;
    }
  }
  return allUrls;
}

/**
 * Turns one results response into landing urls and appends the new, indexable
 * ones to the current sitemap file.
 *
 * For every result: builds the landing url (by persistent identifier when there
 * is one, otherwise by relcanId), counts it as duplicate when it is already in
 * allUrls, writes it to the no-index file when
 * ResultLandingComponent.checkIfAllowed rejects it, and otherwise appends it to
 * the sitemap. After every 50000 written urls the current file is closed and
 * createSitemapFile starts the next one.
 *
 * NOTE(review): the signature and the first two statements were missing in the
 * reviewed copy; they are reconstructed from the two call sites and from the
 * names the surviving body uses.
 *
 * @param response    superagent response of a results request
 * @param allUrls     set of urls seen so far; new urls are added to it
 * @param resultsType passed on to createSitemapFile when a new file is started
 * @param subject     passed on to createSitemapFile when a new file is started
 * @returns the same set that was passed in
 */
function parseAllUrls(response: any, allUrls: Set<string>, resultsType: string, subject: string | null = null) {
  let responses: any = response.body['results'];
  // NOTE(review): treating a missing "results" as length 0 is part of the
  // reconstruction; it routes such responses into the error file below.
  let length = responses ? (Array.isArray(responses) ? responses.length : 1) : 0;

  if(length < 100 && length > 0) {
    console.log("num of results: "+length + " " + response.request.url);
  }
  if(length == 0) {
    fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n");
  }

  for (let i = 0; i < length; i++) {
    let curResponse = Array.isArray(responses) ? responses[i] : responses;
    let resData = curResponse['result']['metadata']['oaf:entity'];

    // Positional argument list expected by parseResultLandingInfo
    // (same order as the commented-out pipe at the end of this loop).
    let param = [
      resData['oaf:result'],
      resData['oaf:result']['title'],
      resData['oaf:result']['rels']['rel'],
      resData['oaf:result']['children'],
      resData['oaf:result']['pid'],
      resData['oaf:result']['journal'],
      resData['oaf:result']['language'],
      resData['oaf:result']['subject'],
      resData['oaf:result']['context'],
      resData['oaf:result']['creator'],
      resData['oaf:result']['country'],
      resData['oaf:result']['programmingLanguage'],
      (resData['extraInfo'] !== undefined && resData['extraInfo']['citations'] !== undefined)
        ? resData['extraInfo']['citations']['citation'] : null,
      curResponse['result']['header']['dri:status'],
      curResponse
    ];

    let resultLandingService = new ResultLandingService(null);
    let resultLandingInfo: ResultLandingInfo = resultLandingService.parseResultLandingInfo(param, null, properties);
    let resultPreview: any = ResultPreview.resultLandingInfoConvert(resultLandingInfo, resultLandingInfo.resultType);

    let pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);
    let url;
    if(pid && pid.id) {
      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
    } else {
      url = getUrlByType(resultPreview.resultType, null, resultLandingInfo.relcanId);
    }

    if(allUrls.has(url)) {
      alreadyin++;
    } else {
      let resultLandingComponent = new ResultLandingComponent(null, null, null, null, null, null, null, null, null, null, null, null, null, null);
      if(!resultLandingComponent.checkIfAllowed(resultLandingInfo)) {
        noIndexedUrls++;
        fs.appendFileSync("./"+noIndexFileName, url+"\n");
      } else {
        // Start a new sitemap file after every 50000 written urls.
        if(finalUrls > 0 && ((finalUrls % 50000) == 0)) {
          console.log("url to be added in file: "+url);
          // NOTE(review): closing tag reconstructed - only "\n" survived in the reviewed copy.
          fs.appendFileSync("./" + fileName, "\n</urlset>");
          console.log("\n");
          createSitemapFile(resultsType, subject);
        }
        finalUrls++;
        // NOTE(review): element tags reconstructed - only "\n" and "" survived in the
        // reviewed copy, so the position of the line break is a guess.
        let urlPre = "\n<url><loc>";
        let urlSuf = "</loc></url>";
        fs.appendFileSync("./"+fileName, urlPre + url + urlSuf);
      }
      allUrls.add(url);
      notin++;
    }

    // .pipe(map(res => [res['result']['header']['dri:status'], res['result']['metadata']['oaf:entity'], res]))
    // .pipe(map(res => [
    //   res[1]['oaf:result'],                 // 0
    //   res[1]['oaf:result']['title'],        // 1
    //   res[1]['oaf:result']['rels']['rel'],  // 2
    //   res[1]['oaf:result']['children'],     // 3
    //   res[1]['oaf:result']['pid'],          // 4
    //   res[1]['oaf:result']['journal'],      // 5
    //   res[1]['oaf:result']['language'],     // 6
    //   res[1]['oaf:result']['subject'],      // 7
    //   res[1]['oaf:result']['context'],      // 8
    //   res[1]['oaf:result']['creator'],      // 9
    //   res[1]['oaf:result']['country'] ,     // 10
    //   res[1]['oaf:result']['programmingLanguage'],              // 11 - software
    //   //res[1]['oaf:result']['resulttype'],
    //   (res[1]['extraInfo'] !== undefined && res[1]['extraInfo']['citations'] !== undefined)
    //     ? res[1]['extraInfo']['citations']['citation'] : null,  // 12
    //   res[0],                               // 13
    //   res[2]                                // 14
    // ]))
    // .pipe(map(res => this.parseResultLandingInfo(res, provenanceActionVocabulary, properties)));
  }
  return allUrls;
}

// function parseAllUrls1(response) {
//   let allUrls = [];
//
//   let responses = response.body['results'];
//   let length = Array.isArray(responses) ? responses.length : 1;
//
//   for (let i = 0; i < length; i++) {
//     let p = new parsingFunctions.ParsingFunctions();
//     let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
//
//     let type = "result";
//     if (resData['resulttype']) {
//       type = resData['resulttype']['classname'];
//     }
//
//     if (resData['pid']) {
//       let identifiers = p.parseIdentifiers(resData['pid']);
//       let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
//
//       if(pid && pid.id) {
//         allUrls[i] = getUrlByType(type, pid, pid.id);
//       } else {
//         let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
//         allUrls[i] = getUrlByType(type, null, canId);
//       }
//     } else {
//       let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
//       allUrls[i] = getUrlByType(type, null, canId);
//     }
//   }
//   return allUrls;
// }
//

/**
 * Builds the landing page url of a result and updates the per-type counters.
 *
 * @param type result type; also used as the last path segment of the url
 * @param pid  persistent identifier, if any; when truthy the query parameter is
 *             "pid" (and urlsWithPid is incremented) regardless of the type
 * @param id   value of the query parameter, used as given
 * @returns landingPrefix + type + "?" + parameter + "=" + id
 */
function getUrlByType(type: any, pid: any, id: any): string {
  let parameter: any;
  switch (type) {
    case "publication":
      publications++;
      parameter = "articleId";
      break;
    case "dataset":
      datasets++;
      parameter = "datasetId";
      break;
    case "software":
      software++;
      parameter = "softwareId";
      break;
    case "other":
      other++;
      parameter = "orpId";
      break;
    default:
      parameter = "id";
  }

  if (pid) {
    urlsWithPid++;
    parameter = "pid";
  }
  return landingPrefix + type + "?" + parameter + "=" + id;
}

// comment out for communities query check
/**
 * Starts loading the communities from contextUrl and stores the ids returned by
 * ContextsService.parseCommunities in publicCommunities.
 * The pending request is kept in communitiesPromise; a failure is only logged,
 * which leaves publicCommunities empty.
 */
function getCommunities() {
  communitiesPromise = fetchResponse(contextUrl)
    .then(communitiesResponse => {
      const contextsService = new ContextsService();
      publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
    })
    .catch(error => console.error("Error getting communities ", error));
}

/**
 * Entry point of a run: starts the timer, creates the first sitemap file, sets
 * the names of the error and no-index files and starts collecting urls - by
 * subject when one is given, otherwise through the refine values.
 *
 * @param resultsPerUrl page size used for every results request
 * @param resultsType   value of the "type" query parameter
 * @param subject       optional subject that restricts the sitemap
 */
function buildSiteMap(resultsPerUrl: number, resultsType: string, subject: string | null = null) {
  console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");

  const date = new Date();
  const dateSuffix = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";//+"_"+date.getTime();

  createSitemapFile(resultsType, subject);
  errorFileName = resultsType + subjectSuffix(subject) + "_error_" + dateSuffix;
  noIndexFileName = resultsType + subjectSuffix(subject) + "_noIndex_" + dateSuffix;

  if (subject) {
    getForSubject(resultsPerUrl, resultsType, subject);
  } else {
    getCommunities(); // comment out for communities query check
    get(resultsPerUrl, resultsType);
  }
}

/**
 * Creates (or overwrites) the next sitemap file and makes it the current one.
 * The first file has no number in its name; later ones carry the number of
 * files created before them.
 *
 * @param resultsType first part of the file name
 * @param subject     optional subject, added to the file name without whitespace
 */
async function createSitemapFile(resultsType: string, subject: string | null = null) {
  // let date = new Date();
  fileName = resultsType + subjectSuffix(subject) + "_sitemap" + (filesCreated > 0 ? filesCreated : "")
    // + "_" + date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate()
    + ".xml";//+"_"+date.getTime();
  filesCreated++;

  console.log("Buiding sitemap in file: " + fileName + "\n");

  // NOTE(review): XML declaration and <urlset> opening tag reconstructed from the
  // sitemap protocol - only "\n" + "" survived in the reviewed copy.
  const sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
  fs.writeFileSync("./" + fileName, sitemap);
}

/**
 * Prints a question on stdout and resolves with the line typed on stdin.
 *
 * @param query text of the prompt
 * @returns the answer as entered
 */
function askQuestion(query: string): Promise<string> {
  const readline = require('readline');
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  return new Promise<string>(resolve => {
    rl.question(query, (ans: string) => {
      rl.close();
      resolve(ans);
    });
  });
}

/**
 * Asks for whatever was not given on the command line and starts the build
 * with 200 results per request.
 *
 * @param resultsType type of results; asked interactively when null/undefined
 * @param subject     subject; asked interactively when null/undefined, an empty
 *                    answer means "no subject"
 */
async function start(resultsType: string, subject: string | null = null) {
  if (resultsType == null) {
    resultsType = await askQuestion("Please provide type of results (publications, datasets, software, other): ");
  }
  console.log("type is: " + resultsType);

  if (subject == null) {
    subject = await askQuestion("Please provide subject. " +
      "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " +
      "any other subject you want or no value if no subject: ");
    if (!subject) {
      subject = null;
    }
  }
  console.log("subject is: " + subject);

  buildSiteMap(200, resultsType, subject);
}

// ---- module state -------------------------------------------------------------

let filesCreated = 0;          // number of sitemap files created so far
let fileName: string;          // sitemap file that is currently written
let errorFileName: string;
let noIndexFileName: string;

const fs = require('fs');
const promiseArray: Promise<unknown>[] = [];  // every results request that has been started

let alreadyin = 0; // duplicate urls
let notin = 0;     // urls seen for the first time
let finalUrls = 0; // urls written to a sitemap file
let noIndexedUrls = 0;
let urlsWithPid = 0;

let publications = 0;
let datasets = 0;
let software = 0;
let other = 0;

// comment out for communities query check
let communitiesPromise: Promise<void>;
let publicCommunities: string[] = [];

const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/";

// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
// comment out for communities query check
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
const contextUrl = "https://services.openaire.eu/openaire/contexts/";

const subjectMapping: Record<string, string> = {"physics": "Physics::Atomic Physics", "mathematics": "Mathematics::Combinatorics"}

// process.argv[3] is the "resultType" argument, process.argv[4] is the "subject" argument
start(process.argv[3], process.argv[4]);