483 lines
19 KiB
TypeScript
483 lines
19 KiB
TypeScript
'use strict';
|
|
|
|
import {properties} from "../../explore/src/environments/environment";
|
|
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
|
|
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
|
|
|
|
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
|
|
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
|
|
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
|
|
import {ResultLandingComponent} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.component";
|
|
import {ResultLandingService} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.service";
|
|
import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/entities/resultLandingInfo";
|
|
|
|
const request = require('superagent');
|
|
|
|
/**
 * Collects landing-page urls for every result of `resultsType` whose subject
 * exactly matches `subject`, then finalizes the sitemap.
 *
 * The work is deferred through `setTimeout` and runs asynchronously; the
 * function itself returns immediately. Page requests are pushed to the shared
 * `promiseArray`, which `finalize` waits on.
 *
 * @param resultsPerUrl number of results requested per page
 * @param resultsType   value of the `type` query parameter
 * @param subject       subject name, or a short alias listed in `subjectMapping`
 */
function getForSubject(resultsPerUrl, resultsType, subject) {
  setTimeout(async () => {
    const allUrls = new Set<string>();

    // Short aliases are expanded through subjectMapping; anything else is used verbatim.
    let reqSubject: string = subjectMapping[subject];
    if (reqSubject == null) {
      reqSubject = subject;
    }

    const url = resultsUrlPrefix + "&query=(" + "resultsubject exact" + " \"" + encodeURIComponent(reqSubject) + "\")" + "&type=" + resultsType;

    // Logs a failed request to the console and to the error file.
    // The error is stringified explicitly: current Node versions reject
    // non-string/Buffer data in appendFileSync, which would make this handler
    // throw and turn a logged failure into a rejected promise.
    const recordFailure = (failedUrl: string, error: unknown): void => {
      console.error("Error getting results ", error);
      fs.appendFileSync("./" + errorFileName, "no response " + failedUrl + " " + String(error) + "\n");
    };

    // Used as-is when the count request below fails.
    let totalResults: number = 150000;
    const countUrl = url + "&size=0&page=0";
    await new Promise<void>((resolve, reject) => {
      request.get(countUrl, function (err: any, response: any) {
        if (!response && err) {
          reject(err);
          return;
        }
        try {
          totalResults = response.body['meta']['total'];
          resolve();
        } catch (parseError) {
          // A response without the expected body must reject instead of
          // throwing out of the callback, where nothing could catch it.
          reject(parseError);
        }
      });
    }).catch(error => recordFailure(countUrl, error));

    const pages: number = Math.ceil(totalResults / resultsPerUrl);
    console.log("totalResults=" + totalResults + " - pages=" + pages);

    for (let page = 0; page < pages; page++) {
      // Pause 500ms between consecutive page requests.
      await new Promise(resolve => setTimeout(resolve, 500));

      const pageUrl = url + "&size=" + resultsPerUrl + "&page=" + page;
      promiseArray.push(
        new Promise<void>((resolve, reject) => {
          request.get(pageUrl, function (err: any, response: any) {
            if (!response && err) {
              reject(err);
              return;
            }
            try {
              parseAllUrls(response, allUrls, resultsType, subject);
              resolve();
            } catch (parseError) {
              reject(parseError);
            }
          });
        }).catch(error => recordFailure(pageUrl, error))
      );
    }

    finalize(resultsPerUrl, resultsType);
  });
}
|
|
|
|
/**
 * Collects landing-page urls by walking every refine (facet) value returned by
 * the search service for `resultsType`, requesting the first page of results
 * for each value, and finally calling `finalize`.
 *
 * The work is deferred through `setTimeout`; the function returns immediately.
 * Result requests are pushed to the shared `promiseArray`.
 *
 * @param resultsPerUrl page size used for every per-value request
 * @param resultsType   value appended to the refine url and used as `type`
 */
function get(resultsPerUrl, resultsType) {
  setTimeout(() => {
    const fieldIdsMap = new SearchFields().RESULT_FIELDS;

    // Built locally instead of appending to the module-level refineUrl, so a
    // repeated call does not produce a url with the type appended twice.
    const refineRequestUrl = refineUrl + resultsType;

    request.get(refineRequestUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ", err);
        return;
      }

      const refineResults = refineResponse.body ? refineResponse.body['refineResults'] : null;
      const keys = refineResults ? Object.keys(refineResults) : null;
      console.log("number of keys: " + (keys ? keys.length : 'error: no refine results returned'));
      if (!keys) {
        // Nothing to iterate over; the problem has just been logged.
        return;
      }

      const allUrls = new Set<string>();

      // Logs a failed request to the console and to the error file. The error
      // is stringified because current Node versions reject non-string/Buffer
      // data in appendFileSync.
      const recordFailure = (failedUrl: string, error: unknown): void => {
        console.error("Error getting results ", error);
        fs.appendFileSync("./" + errorFileName, "no response " + failedUrl + " " + String(error) + "\n");
      };

      for (const key of keys) {
        // comment out for communities query check
        if (key == "community") {
          // The public community ids must be loaded before filtering on them.
          await communitiesPromise;
        }
        console.log("key: " + key + ", number of values: " + refineResults[key].length);

        const fieldDefinition = fieldIdsMap[key];
        if (!fieldDefinition) {
          // Without a field definition there is no equality operator to query with.
          console.error("No search field definition for refine key: " + key);
          continue;
        }

        for (const value of refineResults[key]) {
          if (!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
            || value.name == "unidentified" || value.name == "Undetermined") {
            console.log("filtered out: " + (value ? ("name: " + value.name + " - id: " + value.id) : value));
            continue;
          }

          if (key == "community") {
            // Only the part of the id before the first "||" is compared.
            const valueId = value.id.split("||")[0];
            // For the communities query check use instead: if(!valueId) {
            if (!valueId || !publicCommunities.includes(valueId)) { // comment out for communities query check
              console.log("hidden community: " + valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + fieldDefinition.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;

          // Pause 500ms between consecutive requests.
          await new Promise(resolve => setTimeout(resolve, 500));
          promiseArray.push(
            new Promise<void>((resolve, reject) => {
              request.get(url, function (requestErr: any, response: any) {
                if (!response && requestErr) {
                  reject(requestErr);
                  return;
                }
                try {
                  parseAllUrls(response, allUrls, resultsType);
                  resolve();
                } catch (parseError) {
                  // Reject instead of throwing out of the callback, where
                  // nothing could catch the exception.
                  reject(parseError);
                }
              });
            }).catch(error => recordFailure(url, error))
          );
        }
        console.log("");
      }

      finalize(resultsPerUrl, resultsType);
    });
  });
}
|
|
// });
|
|
|
|
/**
 * Waits for every queued request in `promiseArray`, prints the run statistics
 * and appends the closing `</urlset>` tag to the current sitemap file.
 *
 * @param resultsPerUrl page size, used only to rebuild the timer label
 * @param resultsType   result type, used only to rebuild the timer label
 */
async function finalize(resultsPerUrl, resultsType) {
  await Promise.all(promiseArray);

  const summaryLines = [
    "\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin,
    "\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls,
    "\nPublications: " + publications + " - Datasets: " + datasets +
      " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid,
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  fs.appendFile("./" + fileName, "\n</urlset>", (appendError) => {
    if (appendError) {
      return console.log("Error appending in file " + fileName + ": ", appendError);
    }
    // Same label as the console.time call in buildSiteMap.
    console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
  });
}
|
|
|
|
/**
 * Earlier variant of `parseAllUrls`: parses a search response through
 * `SearchResearchResultsService`, derives one landing url per result and
 * appends every url not yet in `allUrls` to the current sitemap file.
 *
 * @param response search response; `response.body['results']` is parsed
 * @param allUrls  set of urls already emitted; mutated and returned
 * @returns the same `allUrls` instance
 */
function parseAllUrls_old(response: any, allUrls: any) {
  // let allUrls: any = [];

  const rawResults: any = response.body['results'];
  const searchService: any = new SearchResearchResultsService();

  // if(responses) {
  //   let length = Array.isArray(responses) ? responses.length : 1;
  //   for (let i = 0; i < length; i++) {
  //     let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
  //
  //     if (resData['pid']) {
  //       if (!Array.isArray(resData['pid'])) {
  //         if (resData['pid'].classid && resData['pid'].classid == 'doi') {
  //           if (resData['pid'].content != '' && resData['pid'].content != null) {
  //             console.log("|"+resData['pid'].content+"| "+(typeof resData['pid'].content));
  //             resData['pid'].content.replace("https://doi.org/", "");
  //           }
  //         }
  //       }
  //     }
  //   }
  // }

  const searchResults: any = searchService.parseResults("result", rawResults, properties);
  const resultCount = searchResults.length;

  if (resultCount > 0 && resultCount < 100) {
    console.log("num of results: " + resultCount + " " + response.request.url);
  }

  if (resultCount == 0) {
    // Empty answers are recorded in the error file with status code and url.
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (let index = 0; index < searchResults.length; index++) {
    const current = searchResults[index];
    const preview: any = ResultPreview.searchResultConvert(current, current.entityType);
    const pid: any = Identifier.getPIDFromIdentifiers(preview.identifiers);

    // Use the persistent identifier when there is one, the result id otherwise.
    const url = (pid && pid.id)
      ? getUrlByType(preview.resultType, pid, encodeURIComponent(pid.id))
      : getUrlByType(preview.resultType, null, preview.id);

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    allUrls.add(url);
    const urlPre = "\n<url><loc>";
    const urlSuf = "</loc></url>";
    fs.appendFileSync("./" + fileName, urlPre + url + urlSuf);
    notin++;
  }

  return allUrls;
}
|
|
|
|
|
|
/**
 * Turns one search response into landing-page urls.
 *
 * Each result is parsed with `ResultLandingService.parseResultLandingInfo`,
 * converted to a preview and mapped to a url via `getUrlByType`. A url not yet
 * in `allUrls` is written either to the no-index file (when
 * `ResultLandingComponent.checkIfAllowed` returns false) or to the current
 * sitemap file. Every 50000 sitemap urls the current file is closed and a new
 * one is started.
 *
 * @param response    search response; `response.body['results']` may be an array or a single object
 * @param allUrls     set of urls already seen; mutated and returned
 * @param resultsType forwarded to `createSitemapFile` when a new file is started
 * @param subject     forwarded to `createSitemapFile` when a new file is started
 * @returns the same `allUrls` instance
 */
function parseAllUrls(response: any, allUrls: any, resultsType: string, subject: string = null) {
  const rawResults: any = response.body['results'];

  // Normalize to a list: nothing -> [], single object -> one-element list.
  const entries: any[] = !rawResults ? [] : (Array.isArray(rawResults) ? rawResults : [rawResults]);
  const total = entries.length;

  if (total > 0 && total < 100) {
    console.log("num of results: " + total + " " + response.request.url);
  }

  if (total == 0) {
    // Empty answers are recorded in the error file with status code and url.
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (const entry of entries) {
    const entity = entry['result']['metadata']['oaf:entity'];
    const oafResult = entity['oaf:result'];
    const hasCitations = entity['extraInfo'] !== undefined && entity['extraInfo']['citations'] !== undefined;

    // Positional argument list handed to parseResultLandingInfo.
    const landingInput = [
      oafResult,                                                       // 0
      oafResult['title'],                                              // 1
      oafResult['rels']['rel'],                                        // 2
      oafResult['children'],                                           // 3
      oafResult['pid'],                                                // 4
      oafResult['journal'],                                            // 5
      oafResult['language'],                                           // 6
      oafResult['subject'],                                            // 7
      oafResult['context'],                                            // 8
      oafResult['creator'],                                            // 9
      oafResult['country'],                                            // 10
      oafResult['programmingLanguage'],                                // 11 - software
      hasCitations ? entity['extraInfo']['citations']['citation'] : null, // 12
      entry['result']['header']['dri:status'],                         // 13
      entry,                                                           // 14
    ];

    const landingService = new ResultLandingService(null);
    const landingInfo: ResultLandingInfo = landingService.parseResultLandingInfo(landingInput, null, properties);
    const preview: any = ResultPreview.resultLandingInfoConvert(landingInfo, landingInfo.resultType);
    const pid: any = Identifier.getPIDFromIdentifiers(preview.identifiers);

    // Use the persistent identifier when there is one, the relcanId otherwise.
    const url = (pid && pid.id)
      ? getUrlByType(preview.resultType, pid, encodeURIComponent(pid.id))
      : getUrlByType(preview.resultType, null, landingInfo.relcanId);

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    const landingComponent = new ResultLandingComponent(
      null, null, null, null, null, null, null,
      null, null, null, null, null, null, null);

    if (landingComponent.checkIfAllowed(landingInfo)) {
      // Start a new sitemap file once the current one holds a multiple of
      // 50000 urls (presumably the sitemap protocol's per-file cap — confirm).
      if (finalUrls > 0 && (finalUrls % 50000) == 0) {
        console.log("url to be added in file: " + url);
        fs.appendFileSync("./" + fileName, "\n</urlset>");
        console.log("\n");
        createSitemapFile(resultsType, subject);
      }

      finalUrls++;
      const urlPre = "\n<url><loc>";
      const urlSuf = "</loc></url>";
      fs.appendFileSync("./" + fileName, urlPre + url + urlSuf);
    } else {
      noIndexedUrls++;
      fs.appendFileSync("./" + noIndexFileName, url + "\n");
    }

    allUrls.add(url);
    notin++;
  }

  return allUrls;
}
|
|
|
|
|
|
// function parseAllUrls1(response) {
|
|
// let allUrls = [];
|
|
//
|
|
// let responses = response.body['results'];
|
|
// let length = Array.isArray(responses) ? responses.length : 1;
|
|
//
|
|
// for (let i = 0; i < length; i++) {
|
|
// let p = new parsingFunctions.ParsingFunctions();
|
|
// let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
|
|
//
|
|
// let type = "result";
|
|
// if (resData['resulttype']) {
|
|
// type = resData['resulttype']['classname'];
|
|
// }
|
|
//
|
|
// if (resData['pid']) {
|
|
// let identifiers = p.parseIdentifiers(resData['pid']);
|
|
// let pid = string_utils.Identifier.getPIDFromIdentifiers(identifiers);
|
|
//
|
|
// if(pid && pid.id) {
|
|
// allUrls[i] = getUrlByType(type, pid, pid.id);
|
|
// } else {
|
|
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
|
|
// allUrls[i] = getUrlByType(type, null, canId);
|
|
// }
|
|
// } else {
|
|
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
|
|
// allUrls[i] = getUrlByType(type, null, canId);
|
|
// }
|
|
// }
|
|
// return allUrls;
|
|
// }
|
|
//
|
|
/**
 * Builds the landing-page url for one result and updates the per-type and
 * pid counters.
 *
 * @param type result type; also used as the path segment after `landingPrefix`
 * @param pid  persistent identifier object, or a falsy value when there is none
 * @param id   value placed after the query parameter (callers pre-encode pid ids)
 * @returns `landingPrefix + type + "?" + parameter + "=" + id`
 */
function getUrlByType(type: any, pid: any, id: any) {
  let parameter: any;

  switch (type) {
    case "publication":
      publications++;
      parameter = "articleId";
      break;
    case "dataset":
      datasets++;
      parameter = "datasetId";
      break;
    case "software":
      software++;
      parameter = "softwareId";
      break;
    case "other":
      other++;
      parameter = "orpId";
      break;
    default:
      parameter = "id";
  }

  // A pid overrides the type-specific parameter name.
  if (pid) {
    urlsWithPid++;
    parameter = "pid";
  }

  return landingPrefix + type + "?" + parameter + "=" + id;
}
|
|
|
|
// comment out for communities query check
|
|
/**
 * Starts loading the community contexts from `contextUrl` and stores the
 * pending work in `communitiesPromise`. On success `publicCommunities` holds
 * the ids returned by `ContextsService.parseCommunities`; a request failure is
 * logged and leaves `publicCommunities` untouched.
 */
function getCommunities() {
  const loading = new Promise((resolve, reject) => {
    request.get(contextUrl, async function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        reject(err);
        return;
      }

      const contextsService = new ContextsService();
      const communities = contextsService.parseCommunities(communitiesResponse.body, false);
      publicCommunities = communities.map(value => value.id);
      resolve(value => value);
    });
  });

  // The catch keeps the stored promise from rejecting when awaited.
  communitiesPromise = loading.catch(error => console.error("Error getting communities ", error));
}
|
|
|
|
/**
 * Entry point of a sitemap run: starts the timer, creates the first sitemap
 * file, sets the error and no-index file names and launches the collection.
 *
 * @param resultsPerUrl page size for the result requests
 * @param resultsType   result type to collect
 * @param subject       when given, only results with this subject are collected
 */
function buildSiteMap(resultsPerUrl, resultsType, subject = null) {
  console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");

  const date = new Date();
  createSitemapFile(resultsType, subject);

  // Both log files share the same prefix (type + whitespace-free subject) and date stamp.
  const namePrefix = resultsType + (subject ? "_" + subject.replace(/\s/g, "") : "");
  const dateStamp = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate();
  errorFileName = namePrefix + "_error_" + dateStamp + ".txt";
  noIndexFileName = namePrefix + "_noIndex_" + dateStamp + ".txt";

  if (!subject) {
    getCommunities(); // comment out for communities query check
    get(resultsPerUrl, resultsType);
    return;
  }
  getForSubject(resultsPerUrl, resultsType, subject);
}
|
|
|
|
/**
 * Starts a new sitemap file: sets `fileName`, increments `filesCreated` and
 * writes the XML declaration plus the opening `<urlset>` tag.
 *
 * The first file has no numeric suffix; later files carry the number of files
 * created so far. Unlike the error and no-index logs, the name has no date stamp.
 *
 * @param resultsType result type, first part of the file name
 * @param subject     optional subject, added to the name with whitespace removed
 */
async function createSitemapFile(resultsType, subject=null) {
  const subjectPart = subject ? "_" + subject.replace(/\s/g, "") : "";
  const sequencePart = filesCreated > 0 ? filesCreated : "";

  fileName = resultsType + subjectPart + "_sitemap" + sequencePart + ".xml";
  filesCreated++;

  console.log("Buiding sitemap in file: " + fileName + "\n");

  const xmlDeclaration = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n";
  const urlsetOpening = "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">";
  fs.writeFileSync("./" + fileName, xmlDeclaration + urlsetOpening);
}
|
|
|
|
/**
 * Prompts on stdin/stdout and resolves with the answer that was typed.
 *
 * @param query text shown as the prompt
 * @returns promise resolved with the answer after the readline interface is closed
 */
function askQuestion(query) {
  const readline = require('readline');
  const prompt = readline.createInterface({input: process.stdin, output: process.stdout});

  return new Promise(resolve => {
    prompt.question(query, answer => {
      prompt.close();
      resolve(answer);
    });
  });
}
|
|
|
|
/**
 * Resolves the run parameters, asking on the console for any that are
 * missing, and starts the sitemap build with 200 results per request.
 *
 * @param resultsType result type; prompted for when null/undefined
 * @param subject     subject filter; prompted for when null/undefined, an empty answer means no subject
 */
async function start(resultsType, subject = null) {
  if (resultsType == null) {
    resultsType = await askQuestion("Please provide type of results (publications, datasets, software, other): ");
  }
  console.log("type is: " + resultsType);

  if (subject == null) {
    const answer = await askQuestion("Please provide subject. " +
      "Available subjects are \"Physics::Atomic Physics\" or physics, \"Mathematics::Combinatorics\" or mathematics, " +
      "any other subject you want or no value if no subject: ");
    // An empty answer is normalized to null.
    subject = answer ? answer : null;
  }
  console.log("subject is: " + subject);

  buildSiteMap(200, resultsType, subject);
}
|
|
|
|
const fs = require('fs');

// ---- Output files --------------------------------------------------------
// Number of sitemap files started so far and the names of the files currently
// written to; all three names are assigned while a run is being set up.
let filesCreated = 0;
let fileName;
let errorFileName;
let noIndexFileName;

// ---- Pending work ----------------------------------------------------------
// Result requests in flight; finalize() waits for all of them.
let promiseArray = [];

// ---- Run statistics (printed by finalize) ----------------------------------
let alreadyin = 0;      // duplicate urls
let notin = 0;          // urls seen for the first time
let finalUrls = 0;      // urls written to sitemap files
let noIndexedUrls = 0;  // urls written to the no-index file
let urlsWithPid = 0;
let publications = 0;
let datasets = 0;
let software = 0;
let other = 0;

// ---- Communities -----------------------------------------------------------
// comment out for communities query check
let communitiesPromise;
let publicCommunities = [];

// ---- Service endpoints -----------------------------------------------------
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/";

// Refine url without the community field:
// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";

// comment out for communities query check
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
const contextUrl = "https://services.openaire.eu/openaire/contexts/";

// Short aliases accepted for the subject argument.
const subjectMapping = {"physics": "Physics::Atomic Physics", "mathematics": "Mathematics::Combinatorics"};

// process.argv[3] is the "resultType" argument, process.argv[4] is the "subject" argument
start(process.argv[3], process.argv[4]);
|