[Trunk | Explore]: Added sitemaps folder to create a sitemap.xml file with all URLs from the research outcomes search page, querying it once for each filter value with only that filter value selected.
git-svn-id: https://svn.driver.research-infrastructures.eu/driver/dnet40/modules/uoa-services-portal/trunk@61220 d315682c-612b-4755-9ff5-7f18f6832af3
This commit is contained in:
parent
642bf36981
commit
4d5d3ea0f0
|
@ -0,0 +1,260 @@
|
|||
'use strict';
|
||||
|
||||
import {properties} from "../../explore/src/environments/environment";
|
||||
import {SearchResearchResultsService} from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service";
|
||||
import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview";
|
||||
|
||||
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
|
||||
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
|
||||
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
|
||||
|
||||
// Third-party modules: express/cors for the HTTP app object, superagent for outgoing requests.
const express = require('express');
// NOTE(review): the express app is created and given the CORS middleware, but no listen()
// call is visible in this file — confirm whether the app is still needed.
const app = express();
const request = require('superagent');
const URL = require('url');
const cors = require('cors');
app.use(cors());
|
||||
|
||||
/**
 * Builds the body of the sitemap.
 *
 * Requests the refine (filter) values from refineUrl and, for every usable value of every
 * refine key, requests one page of search results filtered by that single value. Each
 * response is handed to parseAllUrls(), which appends the not-yet-seen landing URLs to the
 * sitemap file. After all requests have settled, the closing </urlset> tag is appended.
 *
 * A failed or unparsable results request is logged (console and error file) and skipped, so
 * that one bad request cannot prevent the sitemap from being closed.
 *
 * @param resultsPerUrl page size requested for each filtered search query.
 */
function get(resultsPerUrl) {
  // NOTE(review): no delay is passed, so this only defers the work to a later timer tick —
  // confirm whether the deferral is still required.
  setTimeout(() => {
    const searchFields = new SearchFields();
    const fieldIdsMap = searchFields.RESULT_FIELDS;

    request.get(refineUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ",err);
        return;
      }

      // An error response may carry no body or no refine results: treat that as "no keys"
      // instead of dereferencing null.
      const refineResults = (refineResponse.body && refineResponse.body['refineResults']) || {};
      const keys = Object.keys(refineResults);
      console.log("number of keys: " + keys.length);

      // Shared across all responses so that every landing URL is written only once.
      const allUrls = new Set();
      const promiseArray = [];

      for (const key of keys) {
        if (key === "community") {
          // Community values are checked against publicCommunities, which is filled by getCommunities().
          await communitiesPromise;
        }

        const values = refineResults[key] || [];
        console.log("key: "+key+", number of values: " + values.length);

        const field = fieldIdsMap[key];
        if (!field) {
          // Without a field definition there is no equality operator to build the query with.
          console.error("No search field definition for key: " + key);
          continue;
        }

        for (const value of values) {
          if (!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
            || value.name === "unidentified" || value.name === "Undetermined") {
            console.log("filtered out: "+(value ? ("name: "+value.name + " - id: "+value.id) : value));
            continue;
          }

          if (key === "community") {
            // Only the part of the id before the first "||" is compared with the public community ids.
            const valueId = value.id.split("||")[0];
            if (!valueId || !publicCommunities.includes(valueId)) {
              console.log("hidden community: "+valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + field.equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=results&page=0&size=" + resultsPerUrl;

          // Always resolve: failures are recorded in the error file, and rejecting here would
          // abort Promise.all below and leave the sitemap without its closing tag.
          promiseArray.push(new Promise((resolve) => {
            request.get(url, function (err: any, response: any) {
              if (!response && err) {
                console.error("Error getting results ", err);
                // The error is converted explicitly; appendFileSync expects a string or buffer.
                fs.appendFileSync("./"+errorFileName, "no response "+url+" "+String(err)+"\n");
              } else {
                try {
                  parseAllUrls(response, allUrls);
                } catch (parseError) {
                  console.error("Error parsing results ", parseError);
                  fs.appendFileSync("./"+errorFileName, "parse error "+url+" "+String(parseError)+"\n");
                }
              }
              resolve();
            })
          }));
        }
        console.log("");
      }

      await Promise.all(promiseArray);
      console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);

      fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
        if (err) {
          return console.log("Error appending in file "+fileName+": ", err);
        }
        console.timeEnd("total_time");
      });
    })
  })
}
|
||||
// });
|
||||
|
||||
/**
 * Escapes the XML special characters of a value so it can be embedded in the sitemap.
 * The "&" replacement runs first so that the entities produced afterwards are not re-escaped.
 */
function escapeXml(value: any) {
  return String(value)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}

/**
 * Converts one search response into landing page URLs and appends every URL that has not
 * been seen before to the sitemap file as a <url><loc>…</loc></url> entry.
 *
 * Side effects: appends to the sitemap and error files, adds the new URLs to allUrls and
 * updates the module-level alreadyin / notin counters.
 *
 * @param response superagent response of a results query; body['results'] is parsed.
 * @param allUrls Set of the URLs already written; used for de-duplication.
 * @returns the same allUrls set that was passed in.
 */
function parseAllUrls(response: any, allUrls: any) {
  const responses: any = response.body['results'];
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);

  if (searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: "+searchResults.length + " " + response.request.url);
  }

  if (searchResults.length === 0) {
    // One line per query that returned no results.
    fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n");
  }

  const urlPre = "<url>\n" +
    " <loc>";
  const urlSuf = "</loc>\n" +
    " </url>";

  for (const searchResult of searchResults) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResult, searchResult.entityType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    // Prefer the persistent identifier when one is available, otherwise fall back to the result id.
    const url = (pid && pid.id)
      ? getUrlByType(resultPreview.resultType, pid, pid.id)
      : getUrlByType(resultPreview.resultType, null, resultPreview.id);

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    allUrls.add(url);
    // The sitemap is XML, so the URL must be entity-escaped before it is written.
    fs.appendFileSync("./"+fileName, urlPre + escapeXml(url) + urlSuf);
    notin++;
  }

  return allUrls;
}
|
||||
|
||||
|
||||
// function parseAllUrls1(response) {
|
||||
// let allUrls = [];
|
||||
//
|
||||
// let responses = response.body['results'];
|
||||
// let length = Array.isArray(responses) ? responses.length : 1;
|
||||
//
|
||||
// for (let i = 0; i < length; i++) {
|
||||
// let p = new parsingFunctions.ParsingFunctions();
|
||||
// let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
|
||||
//
|
||||
// let type = "result";
|
||||
// if (resData['resulttype']) {
|
||||
// type = resData['resulttype']['classname'];
|
||||
// }
|
||||
//
|
||||
// if (resData['pid']) {
|
||||
// let identifiers = p.parseIdentifiers(resData['pid']);
|
||||
// let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
|
||||
//
|
||||
// if(pid && pid.id) {
|
||||
// allUrls[i] = getUrlByType(type, pid, pid.id);
|
||||
// } else {
|
||||
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
|
||||
// allUrls[i] = getUrlByType(type, null, canId);
|
||||
// }
|
||||
// } else {
|
||||
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
|
||||
// allUrls[i] = getUrlByType(type, null, canId);
|
||||
// }
|
||||
// }
|
||||
// return allUrls;
|
||||
// }
|
||||
//
|
||||
/**
 * Builds the landing page URL of a result: landingPrefix + type + "?" + parameter + "=" + id.
 *
 * The query parameter is "pid" whenever a pid is given; otherwise it depends on the result
 * type, with "id" used for any type that is not listed.
 *
 * @param type result type; also used as the last path segment of the URL.
 * @param pid persistent identifier object, or a falsy value when the result has none.
 * @param id identifier value placed after the "=".
 * @returns the landing page URL as a string.
 */
function getUrlByType(type: any, pid: any, id: any) {
  const parameterByType: any = {
    publication: "articleId",
    dataset: "datasetId",
    software: "softwareId",
    other: "orpId"
  };

  let queryParameter: any = "id";
  if (pid) {
    queryParameter = "pid";
  } else if (typeof type === "string" && Object.prototype.hasOwnProperty.call(parameterByType, type)) {
    queryParameter = parameterByType[type];
  }

  return landingPrefix + type + "?" + queryParameter + "=" + id;
}
|
||||
|
||||
/**
 * Starts loading the public communities from contextUrl and stores the pending operation in
 * communitiesPromise, so that callers can wait until publicCommunities has been filled.
 *
 * The promise is resolved only after the request has completed. It is resolved on failure as
 * well (publicCommunities then keeps its previous value), so a waiting caller never hangs;
 * the failure is reported on the console.
 */
function getCommunities() {
  communitiesPromise = new Promise((resolve) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        console.error("Error getting communities ", err);
      } else {
        try {
          const contextsService = new ContextsService();
          publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        } catch (parseError) {
          console.error("Error parsing communities ", parseError);
        }
      }
      // Resolve after the ids are in place; resolving earlier would let the community
      // filter run against a list that has not been loaded yet.
      resolve();
    })
  });
}
|
||||
|
||||
/**
 * Entry point: creates the sitemap file for the current date, writes the XML header and the
 * opening <urlset> tag, then starts loading the communities and collecting the URLs.
 *
 * Sets the module-level fileName and errorFileName. The "total_time" timer started here is
 * stopped by get() once the closing tag has been appended.
 *
 * @param resultsPerUrl page size forwarded to get() for each filtered search query.
 */
function buildSiteMap(resultsPerUrl) {
  console.time("total_time");

  const date = new Date();
  // getMonth() is 0-based, so 1 is added to obtain the calendar month.
  const dateSuffix = date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate();
  fileName = "sitemap_"+dateSuffix+".xml";
  errorFileName = "error_"+dateSuffix;
  console.log("Buiding sitemap in file: "+fileName+"\n");

  const sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">";

  // Written synchronously so the header is guaranteed to be in the file before any URL entry
  // is appended to it.
  try {
    fs.writeFileSync("./"+fileName, sitemap);
  } catch (err) {
    console.log("Error writing in file "+fileName+": ", err);
    // Without the header the file cannot become a valid sitemap, so nothing else is started.
    return;
  }

  getCommunities();
  get(resultsPerUrl);
}
|
||||
|
||||
|
||||
// Node file system module, used for the sitemap and error files.
const fs = require('fs');

// Names of the sitemap file and of the error file; assigned by buildSiteMap().
let fileName;
let errorFileName;

// Counters updated by parseAllUrls().
let alreadyin = 0; // duplicate urls
let notin = 0; // urls written to the sitemap

// Pending load of the communities (set by getCommunities()) and the resulting community ids.
let communitiesPromise;
let publicCommunities = [];

// Service endpoints and the prefix of the landing page URLs placed in the sitemap.
const refineUrl = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
const resultsUrlPrefix = "https://beta.services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://beta.explore.openaire.eu/search/";
const contextUrl = "https://beta.services.openaire.eu/openaire/contexts/";

// Build the sitemap, requesting 100 results per filtered query.
buildSiteMap(100);
|
|
@ -0,0 +1,23 @@
|
|||
{
|
||||
"name": "urls_for_sitemap",
|
||||
"version": "1.0.0",
|
||||
"description": "Caching in memory",
|
||||
"main": "cache.js",
|
||||
"scripts": {
|
||||
"start": "PORT=3100 node extractUrlsFromSearch.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"cors": "^2.8.5",
|
||||
"express": "^4.17.1",
|
||||
"superagent": "^5.0.5"
|
||||
},
|
||||
"devDependencies": {
|
||||
"typescript": "3.2.4",
|
||||
"@types/node": "^8.0.30"
|
||||
},
|
||||
"engines": {
|
||||
"node": "8.1.x"
|
||||
},
|
||||
"author": "Konstantina Galouni <kgalouni@di.uoa.gr>",
|
||||
"license": "NKUA"
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
# Runs the TypeScript script extractUrlsFromSearch.ts with ts-node, invoked through npx.
npx ts-node extractUrlsFromSearch.ts
|
|
@ -0,0 +1,10 @@
|
|||
{
|
||||
"compilerOptions": {
|
||||
"typeRoots": [
|
||||
"node_modules/@types"
|
||||
],
|
||||
"noImplicitAny": false,
|
||||
"emitDecoratorMetadata": true,
|
||||
"experimentalDecorators": true
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue