// explore-services/services/sitemaps/extractUrlsFromSearch.ts
//
// 257 lines
// 9.4 KiB
// TypeScript

'use strict';
import {properties} from "../../explore/src/environments/environment";
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
const request = require('superagent');
/**
 * Crawls the search API and appends one sitemap entry per unique result URL.
 *
 * Steps:
 *  1. Requests the refine (facet) values from `refineUrl`.
 *  2. For every refine key and every usable value of that key, issues one search
 *     request filtered on that value and passes the response to `parseAllUrls`,
 *     which appends the new URLs to the sitemap file.
 *  3. Once all requests have settled, appends the closing `</urlset>` tag and ends
 *     the "total_time" console timer.
 *
 * Values whose name marks them as unknown / not available, and communities that
 * are not listed in `publicCommunities`, are skipped.
 *
 * @param resultsPerUrl value sent as the `size` query parameter of each search request
 */
function get(resultsPerUrl) {
  setTimeout(() => {
    const searchFields = new SearchFields();
    const fieldIdsMap = searchFields.RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ",err);
        return;
      }

      // A response without refine results is treated as "no keys" instead of
      // crashing on a null key list; the sitemap is then still closed properly.
      const refineResults = (refineResponse.body && refineResponse.body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      const allUrls = new Set<string>();
      const promiseArray: Promise<void>[] = [];

      for (const key of keys) {
        if (key == "community") {
          // The list of public communities must be loaded before filtering on it.
          await communitiesPromise;
        }
        console.log("key: "+key+", number of values: " + refineResults[key].length);

        const field = fieldIdsMap[key];
        if (!field) {
          // Without a field definition the filter query cannot be built.
          console.error("No search field definition for refine key: " + key);
          continue;
        }

        for (const value of refineResults[key]) {
          if (!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
            || value.name == "unidentified" || value.name == "Undetermined") {
            console.log("filtered out: "+(value ? ("name: "+value.name + " - id: "+value.id) : value));
            continue;
          }

          if (key == "community") {
            // Only the part of the id before "||" is compared with the public community ids.
            const valueId = value.id.split("||")[0];
            if (!valueId || !publicCommunities.includes(valueId)) {
              console.log("hidden community: "+valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;

          // Throttle: start at most one new search request every 500 ms.
          await new Promise(resolve => setTimeout(resolve, 500));

          promiseArray.push(
            new Promise<void>((resolve, reject) => {
              request.get(url, function (err: any, response: any) {
                if (!response && err) {
                  reject(err);
                  return;
                }
                try {
                  parseAllUrls(response, allUrls);
                  resolve();
                } catch (parseError) {
                  // Settle the promise even when parsing throws; otherwise
                  // Promise.all below would wait forever.
                  reject(parseError);
                }
              })
            }).catch(error => {
              console.error("Error getting results ", error);
              // appendFileSync only accepts strings/buffers, so the error is
              // converted explicitly before it is written to the error file.
              fs.appendFileSync("./"+errorFileName, "no response "+url+" " + String(error) + "\n");
            }));
        }
        console.log("");
      }

      await Promise.all(promiseArray);
      console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);

      fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
        if (err) {
          return console.log("Error appending in file "+fileName+": ", err);
        }
        console.timeEnd("total_time");
      });
    })
  })
}
// });
/**
 * Converts one search response into landing-page URLs and appends every URL that
 * has not been seen before to the sitemap file.
 *
 * Side effects: appends `<url><loc>…</loc></url>` lines to `fileName`, appends a
 * line to `errorFileName` when the response contains no results, and updates the
 * `alreadyin` / `notin` counters.
 *
 * @param response search API response; `response.body['results']` is parsed
 * @param allUrls  set of URLs already written; new URLs are added to it
 * @returns the same `allUrls` set
 */
function parseAllUrls(response: any, allUrls: any) {
  const responses: any = response.body['results'];
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);

  if (searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: "+searchResults.length + " " + response.request.url);
  }
  if (searchResults.length == 0) {
    // One entry per line in the error file (newline, not the literal "/n").
    fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n");
  }

  const urlPre = "\n<url><loc>";
  const urlSuf = "</loc></url>";

  for (let j = 0; j < searchResults.length; j++) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    // Prefer the persistent identifier when there is one; otherwise use the result id.
    let url;
    if (pid && pid.id) {
      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
    } else {
      url = getUrlByType(resultPreview.resultType, null, resultPreview.id);
    }

    if (allUrls.has(url)) {
      alreadyin++;
    } else {
      allUrls.add(url);
      fs.appendFileSync("./"+fileName, urlPre + url + urlSuf);
      notin++;
    }
  }
  return allUrls;
}
// function parseAllUrls1(response) {
// let allUrls = [];
//
// let responses = response.body['results'];
// let length = Array.isArray(responses) ? responses.length : 1;
//
// for (let i = 0; i < length; i++) {
// let p = new parsingFunctions.ParsingFunctions();
// let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
//
// let type = "result";
// if (resData['resulttype']) {
// type = resData['resulttype']['classname'];
// }
//
// if (resData['pid']) {
// let identifiers = p.parseIdentifiers(resData['pid']);
// let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
//
// if(pid && pid.id) {
// allUrls[i] = getUrlByType(type, pid, pid.id);
// } else {
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
// allUrls[i] = getUrlByType(type, null, canId);
// }
// } else {
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
// allUrls[i] = getUrlByType(type, null, canId);
// }
// }
// return allUrls;
// }
//
/**
 * Builds the landing-page URL for a result.
 *
 * The query parameter name is "pid" whenever `pid` is truthy; otherwise it is
 * chosen from the result type ("articleId", "datasetId", "softwareId", "orpId"),
 * falling back to "id" for any other type.
 *
 * @param type result type, also used as the path segment after `landingPrefix`
 * @param pid  persistent identifier object, or a falsy value when there is none
 * @param id   value placed after the "=" in the query string
 * @returns `landingPrefix` + type + "?" + parameter + "=" + id
 */
function getUrlByType(type: any, pid: any, id: any) {
  let queryParam: any;
  if (pid) {
    queryParam = "pid";
  } else {
    switch (type) {
      case "publication":
        queryParam = "articleId";
        break;
      case "dataset":
        queryParam = "datasetId";
        break;
      case "software":
        queryParam = "softwareId";
        break;
      case "other":
        queryParam = "orpId";
        break;
      default:
        queryParam = "id";
    }
  }
  const base = landingPrefix + type;
  return base + "?" + queryParam + "=" + id;
}
/**
 * Starts loading the community list from `contextUrl` and stores the pending
 * operation in `communitiesPromise`.
 *
 * On success `publicCommunities` is replaced with the ids returned by
 * `ContextsService.parseCommunities`. Any failure (request error or parsing
 * exception) is logged and swallowed, so `communitiesPromise` always resolves
 * and `publicCommunities` keeps its previous value.
 */
function getCommunities() {
  communitiesPromise = new Promise<void>((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        reject(err);
        return;
      }
      try {
        const contextsService = new ContextsService();
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        resolve();
      } catch (parseError) {
        // Settle the promise even when parsing throws; callers await it and
        // would otherwise wait forever.
        reject(parseError);
      }
    })
  }).catch(error => console.error("Error getting communities ", error));
}
/**
 * Entry point: creates today's sitemap file and starts the crawl.
 *
 * Sets `fileName` / `errorFileName` from the current date, writes the XML header
 * and opening `<urlset>` tag, then starts loading the communities and the search
 * crawl. The "total_time" console timer started here is ended by `get`.
 *
 * @param resultsPerUrl forwarded unchanged to `get`
 */
function buildSiteMap(resultsPerUrl) {
  console.time("total_time");

  const date = new Date();
  const dateSuffix = date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate();
  fileName = "sitemap_"+dateSuffix+".xml";
  errorFileName = "error_"+dateSuffix+".txt";
  console.log("Buiding sitemap in file: "+fileName+"\n");

  const sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";

  // The header is written synchronously: entries are appended later with
  // appendFileSync, and an asynchronous write finishing late would truncate them.
  try {
    fs.writeFileSync("./"+fileName, sitemap);
  } catch (err) {
    // Without an output file there is nothing to append to, so the crawl is not started.
    console.log("Error writing in file "+fileName+": ", err);
    return;
  }

  getCommunities();
  get(resultsPerUrl);
}
// ---- Module state shared by the functions above ----

// Name of the sitemap XML file; assigned in buildSiteMap from the current date.
let fileName;
// Name of the text file that collects failed / empty requests; assigned in buildSiteMap.
let errorFileName;
const fs = require('fs');
// Number of URLs skipped by parseAllUrls because they were already in the set.
let alreadyin = 0; // duplicate urls
// Number of unique URLs that parseAllUrls appended to the sitemap file.
let notin= 0;
// Set by getCommunities; awaited in get before the "community" refine key is processed.
let communitiesPromise;
// Community ids produced by getCommunities; get skips communities that are not in this list.
let publicCommunities = [];
// Request that returns only the refine (facet) values (size=0) for the listed fields.
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
// Base of every per-value search request; get appends the fq filter, type, page and size.
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
// Prefix of the landing-page URLs written to the sitemap (see getUrlByType).
const landingPrefix = "https://explore.openaire.eu/search/";
// Endpoint queried by getCommunities.
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
// Script entry point: build the sitemap, passing 100 as resultsPerUrl.
buildSiteMap(100);