From 9f48768fe300e2e7fd8a3e9ba222d905365ecf3a Mon Sep 17 00:00:00 2001 From: "konstantina.galouni" Date: Fri, 6 Aug 2021 14:10:42 +0300 Subject: [PATCH 1/2] [Explore Service]: resultLanding.component.ts: added "checkIfAllowed()" method to be called by sitemap script and by updated "addNoIndexFilter()" method | Added spam words in "title_authors_words" | Added "publicCommunities" array with public communities in production - no index results from these communities. --- explore/src/app/openaireLibrary | 2 +- explore/src/assets/common-assets | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/explore/src/app/openaireLibrary b/explore/src/app/openaireLibrary index 58b44c7b..b4b3e143 160000 --- a/explore/src/app/openaireLibrary +++ b/explore/src/app/openaireLibrary @@ -1 +1 @@ -Subproject commit 58b44c7bb0bfe8928dd43c12dc1560a5ea3e121a +Subproject commit b4b3e143bbb4ca2b33eadb6020d4ea016f408bfa diff --git a/explore/src/assets/common-assets b/explore/src/assets/common-assets index 2f572df5..ffc664c3 160000 --- a/explore/src/assets/common-assets +++ b/explore/src/assets/common-assets @@ -1 +1 @@ -Subproject commit 2f572df5783cb32cc84969b75939e45ffce3b424 +Subproject commit ffc664c3672226fb16eacb4886a3f328f5bf1d33 From f5cb4c2acdc08e3ae6ca1e28053d25d50e757d54 Mon Sep 17 00:00:00 2001 From: "konstantina.galouni" Date: Fri, 6 Aug 2021 14:19:58 +0300 Subject: [PATCH 2/2] [Explore Service | Services]: 1. extractUrlsFromSearch.ts: changed results parsing (used landing parsing instead of search) to call method "checkIfAllowed()" (no index filtering) | Added more numbers for statistics. 2. package.json: Upgrade node version. 
--- services/sitemaps/extractUrlsFromSearch.ts | 131 +++++++++++++++++++-- services/sitemaps/package.json | 2 +- 2 files changed, 124 insertions(+), 9 deletions(-) diff --git a/services/sitemaps/extractUrlsFromSearch.ts b/services/sitemaps/extractUrlsFromSearch.ts index 00ddb20b..aebba9ab 100644 --- a/services/sitemaps/extractUrlsFromSearch.ts +++ b/services/sitemaps/extractUrlsFromSearch.ts @@ -7,6 +7,9 @@ import {ResultPreview} from "../../explore/src/app/openaireLibrary/utils/result- import {Identifier} from "../../explore/src/app/openaireLibrary/utils/string-utils.class"; import {SearchFields} from "../../explore/src/app/openaireLibrary/utils/properties/searchFields"; import {ContextsService} from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service"; +import {ResultLandingComponent} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.component"; +import {ResultLandingService} from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.service"; +import {ResultLandingInfo} from "../../explore/src/app/openaireLibrary/utils/entities/resultLandingInfo"; const request = require('superagent'); @@ -27,6 +30,7 @@ function get(resultsPerUrl) { let promiseArray = []; for (let key of keys) { + // comment out for communities query check if(key == "community") { await communitiesPromise; } @@ -48,7 +52,8 @@ function get(resultsPerUrl) { valueId = idArray[0]; } } - if(!valueId || !publicCommunities.includes(valueId)) { + // if(!valueId) { + if(!valueId || !publicCommunities.includes(valueId)) { // comment out for communities query check console.log("hidden community: "+valueId); continue; } @@ -64,7 +69,7 @@ function get(resultsPerUrl) { reject(err); } else { parseAllUrls(response, allUrls); - resolve(); + resolve(value => value); } }) }).catch(error => { @@ -79,6 +84,9 @@ function get(resultsPerUrl) { await Promise.all(promiseArray); console.log("\nDuplicate urls: "+alreadyin + " vs unique urls: 
"+notin); + console.log("\nNo indexed urls: "+noIndexedUrls + " vs final urls: "+finalUrls); + console.log("\nPublications: "+publications + " - Datasets: "+datasets + + " - Software: "+software + " - Other: "+other + " --- urls with pid: "+urlsWithPid); fs.appendFile("./" + fileName, "\n", function (err) { if (err) { @@ -92,7 +100,7 @@ function get(resultsPerUrl) { } // }); -function parseAllUrls(response: any, allUrls: any) { +function parseAllUrls_old(response: any, allUrls: any) { // let allUrls: any = []; let responses: any = response.body['results']; @@ -123,7 +131,7 @@ function parseAllUrls(response: any, allUrls: any) { } if(searchResults.length == 0) { - fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"/n"); + fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n"); } for(let j=0; j 0) { + console.log("num of results: "+length + " " + response.request.url); + } + + if(length == 0) { + fs.appendFileSync("./"+errorFileName, response.statusCode+" "+response.request.url+"\n"); + } + + for (let i = 0; i < length; i++) { + let curResponse = Array.isArray(responses) ? responses[i] : responses; + let resData = curResponse['result']['metadata']['oaf:entity']; + + let param = [resData['oaf:result'], resData['oaf:result']['title'], resData['oaf:result']['rels']['rel'], + resData['oaf:result']['children'], resData['oaf:result']['pid'], resData['oaf:result']['journal'], + resData['oaf:result']['language'], resData['oaf:result']['subject'], resData['oaf:result']['context'], + resData['oaf:result']['creator'], resData['oaf:result']['country'], resData['oaf:result']['programmingLanguage'], + (resData['extraInfo'] !== undefined && resData['extraInfo']['citations'] !== undefined) + ? 
resData['extraInfo']['citations']['citation'] : null, + curResponse['result']['header']['dri:status'], curResponse]; + + let resultLandingService = new ResultLandingService(null); + let resultLandingInfo: ResultLandingInfo = resultLandingService.parseResultLandingInfo(param, null, properties); + + let resultPreview: any = ResultPreview.resultLandingInfoConvert(resultLandingInfo, resultLandingInfo.resultType); + + let pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers); + let url; + if(pid && pid.id) { + url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id)); + } else { + url = getUrlByType(resultPreview.resultType, null, resultLandingInfo.relcanId); + } + if(allUrls.has(url)) { + alreadyin++; + } else { + let resultLandingComponent = new ResultLandingComponent(null, null, null, + null, null, null, null, null, null, + null, null, null, null, null); + + if(!resultLandingComponent.checkIfAllowed(resultLandingInfo)) { + noIndexedUrls++; + fs.appendFileSync("./"+noIndexFileName, url+"\n"); + } else { + finalUrls++; + let urlPre = "\n"; + let urlSuf = ""; + fs.appendFileSync("./"+fileName, urlPre + url + urlSuf); + } + + allUrls.add(url); + notin++; + } + + // .pipe(map(res => [res['result']['header']['dri:status'], res['result']['metadata']['oaf:entity'], res])) + // .pipe(map(res => [ + // res[1]['oaf:result'], // 0 + // res[1]['oaf:result']['title'], // 1 + // res[1]['oaf:result']['rels']['rel'], // 2 + // res[1]['oaf:result']['children'], // 3 + // res[1]['oaf:result']['pid'], // 4 + // res[1]['oaf:result']['journal'], // 5 + // res[1]['oaf:result']['language'], // 6 + // res[1]['oaf:result']['subject'], // 7 + // res[1]['oaf:result']['context'], // 8 + // res[1]['oaf:result']['creator'], // 9 + // res[1]['oaf:result']['country'] , // 10 + // res[1]['oaf:result']['programmingLanguage'], // 11 - software + // //res[1]['oaf:result']['resulttype'], + // (res[1]['extraInfo'] !== undefined && res[1]['extraInfo']['citations'] 
!== undefined) + // ? res[1]['extraInfo']['citations']['citation'] : null, // 12 + // res[0], // 13 + // res[2] // 14 + // ])) + // .pipe(map(res => this.parseResultLandingInfo(res, provenanceActionVocabulary, properties))); + + } + + return allUrls; +} + + // function parseAllUrls1(response) { // let allUrls = []; // @@ -186,22 +281,28 @@ function getUrlByType(type: any, pid: any, id: any) { let parameter: any = ""; if (type === "publication") { + publications++; parameter = "articleId"; } else if (type === "dataset") { + datasets++; parameter = "datasetId"; } else if (type === "software") { + software++; parameter = "softwareId"; } else if (type === "other") { + other++; parameter = "orpId"; } else { parameter = "id"; } if(pid) { + urlsWithPid++; parameter = "pid"; } return landingPrefix+type+"?"+parameter+"="+id; } +// comment out for communities query check function getCommunities() { communitiesPromise = new Promise((resolve, reject) => { request.get(contextUrl, async function (err: any, communitiesResponse: any) { @@ -210,7 +311,7 @@ function getCommunities() { } else { const contextsService = new ContextsService(); publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id); - resolve(); + resolve(value => value); } }) }).catch(error => console.error("Error getting communities ", error)); @@ -222,6 +323,7 @@ function buildSiteMap(resultsPerUrl) { let date = new Date(); fileName = "sitemap_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".xml";//+"_"+date.getTime(); errorFileName = "error_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime(); + noIndexFileName = "noIndex_"+date.getFullYear()+"_"+(date.getMonth()+1)+"_"+date.getDate()+".txt";//+"_"+date.getTime(); console.log("Buiding sitemap in file: "+fileName+"\n"); let sitemap = "\n" + @@ -233,24 +335,37 @@ function buildSiteMap(resultsPerUrl) { } }); - getCommunities(); + getCommunities(); // comment 
out for communities query check get(resultsPerUrl); } let fileName; let errorFileName; +let noIndexFileName; const fs = require('fs'); let alreadyin = 0; // duplicate urls let notin= 0; +let finalUrls = 0; +let noIndexedUrls = 0; +let publications = 0; +let datasets = 0; +let software = 0; +let other = 0; +let urlsWithPid = 0; +// comment out for communities query check let communitiesPromise; let publicCommunities = []; -const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; + const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json"; const landingPrefix = "https://explore.openaire.eu/search/"; +// const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; + +// comment out for communities query check +const refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&type=results&page=0&size=0"; const contextUrl = "https://services.openaire.eu/openaire/contexts/"; -buildSiteMap(100); +buildSiteMap(150); diff --git a/services/sitemaps/package.json b/services/sitemaps/package.json index 7ca9a467..1d67b438 100644 --- a/services/sitemaps/package.json +++ b/services/sitemaps/package.json @@ -14,7 +14,7 @@ "@types/node": "^8.0.30" }, "engines": { - "node": "8.1.x" + "node": "16.3.0" }, "author": "Konstantina Galouni ", "license": "NKUA"