[Explore Service | Services]:
1. extractUrlsFromSearch.ts: changed results parsing (uses landing parsing instead of search parsing) in order to call the method "checkIfAllowed()" (no-index filtering) | Added more counters for statistics. 2. package.json: Upgraded node version.
This commit is contained in:
parent
9f48768fe3
commit
f5cb4c2acd
|
@ -7,6 +7,9 @@ import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result-
|
||||||
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
|
import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class";
|
||||||
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
|
import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields";
|
||||||
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
|
import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service";
|
||||||
|
import {ResultLandingComponent} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.component";
|
||||||
|
import {ResultLandingService} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.service";
|
||||||
|
import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/entities/resultLandingInfo";
|
||||||
|
|
||||||
const request = require('superagent');
|
const request = require('superagent');
|
||||||
|
|
||||||
|
@ -27,6 +30,7 @@ function get(resultsPerUrl) {
|
||||||
let promiseArray = [];
|
let promiseArray = [];
|
||||||
|
|
||||||
for (let key of keys) {
|
for (let key of keys) {
|
||||||
|
// comment out for communities query check
|
||||||
if(key == "community") {
|
if(key == "community") {
|
||||||
await communitiesPromise;
|
await communitiesPromise;
|
||||||
}
|
}
|
||||||
|
@ -48,7 +52,8 @@ function get(resultsPerUrl) {
|
||||||
valueId = idArray[0];
|
valueId = idArray[0];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(!valueId || !publicCommunities.includes(valueId)) {
|
// if(!valueId) {
|
||||||
|
if(!valueId || !publicCommunities.includes(valueId)) { // comment out for communities query check
|
||||||
console.log("hidden community: "+valueId);
|
console.log("hidden community: "+valueId);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -64,7 +69,7 @@ function get(resultsPerUrl) {
|
||||||
reject(err);
|
reject(err);
|
||||||
} else {
|
} else {
|
||||||
parseAllUrls(response, allUrls);
|
parseAllUrls(response, allUrls);
|
||||||
resolve();
|
resolve(value => value);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}).catch(error => {
|
}).catch(error => {
|
||||||
|
@ -79,6 +84,9 @@ function get(resultsPerUrl) {
|
||||||
|
|
||||||
await Promise.all(promiseArray);
|
await Promise.all(promiseArray);
|
||||||
console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);
|
console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: "+notin);
|
||||||
|
console.log("\nNo indexed urls: "+noIndexedUrls + " vs final urls: "+finalUrls);
|
||||||
|
console.log("\nPublications: "+publications + " - Datasets: "+datasets +
|
||||||
|
" - Software: "+software + " - Other: "+other + " --- urls with pid: "+urlsWithPid);
|
||||||
|
|
||||||
fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
|
fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
|
||||||
if (err) {
|
if (err) {
|
||||||
|
@ -92,7 +100,7 @@ function get(resultsPerUrl) {
|
||||||
}
|
}
|
||||||
// });
|
// });
|
||||||
|
|
||||||
function parseAllUrls(response: any, allUrls: any) {
|
function parseAllUrls_old(response: any, allUrls: any) {
|
||||||
// let allUrls: any = [];
|
// let allUrls: any = [];
|
||||||
|
|
||||||
let responses: any = response.body['results'];
|
let responses: any = response.body['results'];
|
||||||
|
@ -123,7 +131,7 @@ function parseAllUrls(response: any, allUrls: any) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if(searchResults.length == 0) {
|
if(searchResults.length == 0) {
|
||||||
fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"/n");
|
fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n");
|
||||||
}
|
}
|
||||||
for(let j=0; j<searchResults.length; j++) {
|
for(let j=0; j<searchResults.length; j++) {
|
||||||
let resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
|
let resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
|
||||||
|
@ -149,6 +157,93 @@ function parseAllUrls(response: any, allUrls: any) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Walks the `results` entry of a search response and records one landing URL per record.
 *
 * For every record the landing-page parser (`ResultLandingService.parseResultLandingInfo`)
 * is used to build a `ResultLandingInfo`, from which the URL is derived via `getUrlByType`
 * (by PID when one is found, otherwise by `relcanId`).
 *  - A URL already present in `allUrls` only increments `alreadyin`.
 *  - A new URL is appended to the sitemap file when `checkIfAllowed()` accepts the record,
 *    or to the no-index file otherwise; it is then added to `allUrls` and counted in `notin`.
 *
 * When the response carries no results, its status code and request URL are appended to
 * the error file.
 *
 * @param response HTTP response; `body['results']`, `statusCode` and `request.url` are read.
 * @param allUrls  collection of URLs seen so far; only `has()` and `add()` are used.
 * @returns the same `allUrls` collection, updated in place.
 */
function parseAllUrls(response: any, allUrls: any) {
  const rawResults: any = response.body['results'];

  // `results` may be missing, a single record, or an array of records.
  let records: any[];
  if (!rawResults) {
    records = [];
  } else if (Array.isArray(rawResults)) {
    records = rawResults;
  } else {
    records = [rawResults];
  }
  const total = records.length;

  if (total > 0 && total < 100) {
    console.log("num of results: " + total + " " + response.request.url);
  }

  if (total === 0) {
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (const record of records) {
    const entity = record['result']['metadata']['oaf:entity'];
    const oafResult = entity['oaf:result'];

    // Positional argument list expected by parseResultLandingInfo (indices 0-14).
    const landingParams = [
      oafResult,                          // 0
      oafResult['title'],                 // 1
      oafResult['rels']['rel'],           // 2
      oafResult['children'],              // 3
      oafResult['pid'],                   // 4
      oafResult['journal'],               // 5
      oafResult['language'],              // 6
      oafResult['subject'],               // 7
      oafResult['context'],               // 8
      oafResult['creator'],               // 9
      oafResult['country'],               // 10
      oafResult['programmingLanguage'],   // 11
      (entity['extraInfo'] !== undefined && entity['extraInfo']['citations'] !== undefined)
        ? entity['extraInfo']['citations']['citation']
        : null,                           // 12
      record['result']['header']['dri:status'], // 13
      record                              // 14
    ];

    const landingService = new ResultLandingService(null);
    const landingInfo: ResultLandingInfo = landingService.parseResultLandingInfo(landingParams, null, properties);

    const preview: any = ResultPreview.resultLandingInfoConvert(landingInfo, landingInfo.resultType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(preview.identifiers);

    // Prefer a PID-based URL; fall back to the record's relcanId.
    const url = (pid && pid.id)
      ? getUrlByType(preview.resultType, pid, encodeURIComponent(pid.id))
      : getUrlByType(preview.resultType, null, landingInfo.relcanId);

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    const landingComponent = new ResultLandingComponent(
      null, null, null, null, null, null, null,
      null, null, null, null, null, null, null);

    if (landingComponent.checkIfAllowed(landingInfo)) {
      finalUrls++;
      const entryOpen = "\n<url><loc>";
      const entryClose = "</loc></url>";
      fs.appendFileSync("./" + fileName, entryOpen + url + entryClose);
    } else {
      noIndexedUrls++;
      fs.appendFileSync("./" + noIndexFileName, url + "\n");
    }

    allUrls.add(url);
    notin++;
  }

  return allUrls;
}
|
||||||
|
|
||||||
|
|
||||||
// function parseAllUrls1(response) {
|
// function parseAllUrls1(response) {
|
||||||
// let allUrls = [];
|
// let allUrls = [];
|
||||||
//
|
//
|
||||||
|
@ -186,22 +281,28 @@ function getUrlByType(type: any, pid: any, id: any) {
|
||||||
let parameter: any = "";
|
let parameter: any = "";
|
||||||
|
|
||||||
if (type === "publication") {
|
if (type === "publication") {
|
||||||
|
publications++;
|
||||||
parameter = "articleId";
|
parameter = "articleId";
|
||||||
} else if (type === "dataset") {
|
} else if (type === "dataset") {
|
||||||
|
datasets++;
|
||||||
parameter = "datasetId";
|
parameter = "datasetId";
|
||||||
} else if (type === "software") {
|
} else if (type === "software") {
|
||||||
|
software++;
|
||||||
parameter = "softwareId";
|
parameter = "softwareId";
|
||||||
} else if (type === "other") {
|
} else if (type === "other") {
|
||||||
|
other++;
|
||||||
parameter = "orpId";
|
parameter = "orpId";
|
||||||
} else {
|
} else {
|
||||||
parameter = "id";
|
parameter = "id";
|
||||||
}
|
}
|
||||||
if(pid) {
|
if(pid) {
|
||||||
|
urlsWithPid++;
|
||||||
parameter = "pid";
|
parameter = "pid";
|
||||||
}
|
}
|
||||||
return landingPrefix+type+"?"+parameter+"="+id;
|
return landingPrefix+type+"?"+parameter+"="+id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// comment out for communities query check
|
||||||
function getCommunities() {
|
function getCommunities() {
|
||||||
communitiesPromise = new Promise((resolve, reject) => {
|
communitiesPromise = new Promise((resolve, reject) => {
|
||||||
request.get(contextUrl, async function (err: any, communitiesResponse: any) {
|
request.get(contextUrl, async function (err: any, communitiesResponse: any) {
|
||||||
|
@ -210,7 +311,7 @@ function getCommunities() {
|
||||||
} else {
|
} else {
|
||||||
const contextsService = new ContextsService();
|
const contextsService = new ContextsService();
|
||||||
publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
|
publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
|
||||||
resolve();
|
resolve(value => value);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}).catch(error => console.error("Error getting communities ", error));
|
}).catch(error => console.error("Error getting communities ", error));
|
||||||
|
@ -222,6 +323,7 @@ function buildSiteMap(resultsPerUrl) {
|
||||||
let date = new Date();
|
let date = new Date();
|
||||||
fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime();
|
fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime();
|
||||||
errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
||||||
|
noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime();
|
||||||
console.log("Buiding sitemap in file: "+fileName+"\n");
|
console.log("Buiding sitemap in file: "+fileName+"\n");
|
||||||
|
|
||||||
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
|
let sitemap = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
|
||||||
|
@ -233,24 +335,37 @@ function buildSiteMap(resultsPerUrl) {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
getCommunities();
|
getCommunities(); // comment out for communities query check
|
||||||
get(resultsPerUrl);
|
get(resultsPerUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
let fileName;
|
let fileName;
|
||||||
let errorFileName;
|
let errorFileName;
|
||||||
|
let noIndexFileName;
|
||||||
const fs = require('fs');
|
const fs = require('fs');
|
||||||
|
|
||||||
let alreadyin = 0; // duplicate urls
|
let alreadyin = 0; // duplicate urls
|
||||||
let notin= 0;
|
let notin= 0;
|
||||||
|
let finalUrls = 0;
|
||||||
|
let noIndexedUrls = 0;
|
||||||
|
let publications = 0;
|
||||||
|
let datasets = 0;
|
||||||
|
let software = 0;
|
||||||
|
let other = 0;
|
||||||
|
let urlsWithPid = 0;
|
||||||
|
|
||||||
|
// comment out for communities query check
|
||||||
let communitiesPromise;
|
let communitiesPromise;
|
||||||
let publicCommunities = [];
|
let publicCommunities = [];
|
||||||
|
|
||||||
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
|
||||||
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
|
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
|
||||||
const landingPrefix = "https://explore.openaire.eu/search/";
|
const landingPrefix = "https://explore.openaire.eu/search/";
|
||||||
|
// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
||||||
|
|
||||||
|
// comment out for communities query check
|
||||||
|
const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0";
|
||||||
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
const contextUrl = "https://services.openaire.eu/openaire/contexts/";
|
||||||
|
|
||||||
buildSiteMap(100);
|
buildSiteMap(150);
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
"@types/node": "^8.0.30"
|
"@types/node": "^8.0.30"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": "8.1.x"
|
"node": "16.3.0"
|
||||||
},
|
},
|
||||||
"author": "Konstantina Galouni <kgalouni@di.uoa.gr>",
|
"author": "Konstantina Galouni <kgalouni@di.uoa.gr>",
|
||||||
"license": "NKUA"
|
"license": "NKUA"
|
||||||
|
|
Loading…
Reference in New Issue