2021-06-15 15:19:37 +02:00
'use strict' ;
import { properties } from "../../explore/src/environments/environment" ;
import { SearchResearchResultsService } from "../../explore/src/app/openaireLibrary/services/searchResearchResults.service" ;
import { ResultPreview } from "../../explore/src/app/openaireLibrary/utils/result-preview/result-preview" ;
import { Identifier } from "../../explore/src/app/openaireLibrary/utils/string-utils.class" ;
import { SearchFields } from "../../explore/src/app/openaireLibrary/utils/properties/searchFields" ;
import { ContextsService } from "../../explore/src/app/openaireLibrary/claims/claim-utils/service/contexts.service" ;
2021-08-06 13:19:58 +02:00
import { ResultLandingComponent } from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.component" ;
import { ResultLandingService } from "../../explore/src/app/openaireLibrary/landingPages/result/resultLanding.service" ;
import { ResultLandingInfo } from "../../explore/src/app/openaireLibrary/utils/entities/resultLandingInfo" ;
2021-06-15 15:19:37 +02:00
const request = require ( 'superagent' ) ;
2021-08-10 11:57:25 +02:00
/**
 * Crawls the search API and feeds every response to `parseAllUrls`, which writes the sitemap entries.
 *
 * Steps:
 *  1. request the refine (facet) values for `resultsType`;
 *  2. for every usable refine value, request the first page (`resultsPerUrl` results) of the results
 *     filtered by that value; consecutive requests are started 500 ms apart;
 *  3. when all requests have settled, print the counters and append the closing `</urlset>` tag
 *     to the current sitemap file.
 *
 * Failed result requests are logged to the console and to the error file; they never abort the crawl.
 *
 * @param resultsPerUrl page size (`size` query parameter) of every filtered results request
 * @param resultsType value of the `type` query parameter, also forwarded to `parseAllUrls`
 */
function get(resultsPerUrl, resultsType) {
  // Zero-delay timer: the crawl starts asynchronously, after the caller has returned.
  setTimeout(() => {
    const searchFields = new SearchFields();
    const fieldIdsMap = searchFields.RESULT_FIELDS;
    // Local copy: appending to the shared `refineUrl` would corrupt it for any later call.
    const refineRequestUrl = refineUrl + resultsType;

    request.get(refineRequestUrl, async function (err: any, refineResponse: any) {
      if (!refineResponse && err) {
        console.error("Error getting refine filters ", err);
        return;
      }

      const refineResults = refineResponse.body ? refineResponse.body['refineResults'] : null;
      const keys = refineResults ? Object.keys(refineResults) : null;
      console.log("number of keys: " + (keys ? keys.length : 'error: no refine results returned'));

      const allUrls = new Set();
      // Without refine results nothing is iterated, but the sitemap file is still closed below.
      for (const key of keys || []) {
        // comment out for communities query check
        if (key == "community") {
          // The community ids are needed before community values can be filtered.
          await communitiesPromise;
        }

        const values = refineResults[key];
        console.log("key: " + key + ", number of values: " + values.length);

        if (!fieldIdsMap[key]) {
          // No operator is known for this field, so no filter query can be built for it.
          console.error("no search field definition for key: " + key);
          continue;
        }

        for (const value of values) {
          if (!value || !value.name || !value.id
            || value.name.toLowerCase().includes('unknown') || value.name.toLowerCase().includes('not available')
            || value.name == "unidentified" || value.name == "Undetermined") {
            console.log("filtered out: " + (value ? ("name: " + value.name + " - id: " + value.id) : value));
            continue;
          }

          if (key == "community") {
            // Only the part of the id before the first "||" is compared against the known communities.
            const valueId = value.id.split("||")[0];
            // if(!valueId) {
            if (!valueId || !publicCommunities.includes(valueId)) { // comment out for communities query check
              console.log("hidden community: " + valueId);
              continue;
            }
          }

          const url = resultsUrlPrefix + "&fq=" + key + " " + fieldIdsMap[key].equalityOperator + " \"" + encodeURIComponent(value.id) + "\"" + "&type=" + resultsType + "&page=0&size=" + resultsPerUrl;

          // Throttle: wait 500 ms before starting the next request.
          await new Promise(resolve => setTimeout(resolve, 500));
          promiseArray.push(
            new Promise((resolve, reject) => {
              request.get(url, function (err: any, response: any) {
                if (!response && err) {
                  reject(err);
                  return;
                }
                try {
                  parseAllUrls(response, allUrls, resultsType);
                  resolve(undefined);
                } catch (parseError) {
                  // Route parsing failures to the same error log instead of throwing out of the callback.
                  reject(parseError);
                }
              });
            }).catch(error => {
              console.error("Error getting results ", error);
              // The error is stringified explicitly: appendFileSync only accepts string/Buffer data.
              fs.appendFileSync("./" + errorFileName, "no response " + url + " " + String(error) + "\n");
            }));
        }
        console.log("");
      }

      await Promise.all(promiseArray);

      console.log("\nDuplicate urls: " + alreadyin + " vs unique urls: " + notin);
      console.log("\nNo indexed urls: " + noIndexedUrls + " vs final urls: " + finalUrls);
      console.log("\nPublications: " + publications + " - Datasets: " + datasets +
        " - Software: " + software + " - Other: " + other + " --- urls with pid: " + urlsWithPid);

      fs.appendFile("./" + fileName, "\n</urlset>", function (err) {
        if (err) {
          return console.log("Error appending in file " + fileName + ": ", err);
        }
        // Label must match the one passed to console.time in buildSiteMap.
        console.timeEnd("total_time (" + resultsPerUrl + " " + resultsType + " per request)");
      });
    });
  });
}
// });
2021-08-06 13:19:58 +02:00
/**
 * Legacy URL extractor; the crawl in `get` calls `parseAllUrls` instead.
 *
 * Parses the `results` of a search response with `SearchResearchResultsService.parseResults`, derives one
 * landing-page URL per result and appends every URL not yet contained in `allUrls` to the current
 * sitemap file. Updates the `alreadyin` / `notin` counters. Unlike `parseAllUrls`, it performs no
 * "allowed to be indexed" check and never starts a new sitemap file.
 *
 * @param response search response; `response.body['results']` holds the records
 * @param allUrls set of URLs seen so far; new URLs are added to it
 * @returns the same `allUrls` instance
 */
function parseAllUrls_old(response: any, allUrls: any) {
  const responses: any = response.body['results'];
  const searchResearchResultsService: any = new SearchResearchResultsService();
  const searchResults: any = searchResearchResultsService.parseResults("result", responses, properties);

  if (searchResults.length < 100 && searchResults.length > 0) {
    console.log("num of results: " + searchResults.length + " " + response.request.url);
  }
  if (searchResults.length == 0) {
    // Requests that produced no results are recorded together with their HTTP status.
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (let j = 0; j < searchResults.length; j++) {
    const resultPreview: any = ResultPreview.searchResultConvert(searchResults[j], searchResults[j].entityType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    // Prefer the persistent identifier; fall back to the id of the preview.
    let url;
    if (pid && pid.id) {
      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
    } else {
      url = getUrlByType(resultPreview.resultType, null, resultPreview.id);
    }

    if (allUrls.has(url)) {
      alreadyin++;
    } else {
      allUrls.add(url);
      const urlPre = "\n<url><loc>";
      const urlSuf = "</loc></url>";
      fs.appendFileSync("./" + fileName, urlPre + url + urlSuf);
      notin++;
    }
  }
  return allUrls;
}
2021-08-10 11:57:25 +02:00
/**
 * Derives the landing-page URL of every record in a search response and records each URL not seen before.
 *
 * Per record:
 *  - the argument array for `ResultLandingService.parseResultLandingInfo` is assembled from the raw record;
 *  - the URL is built from the persistent identifier when one is found, otherwise from `relcanId`;
 *  - a URL already in `allUrls` only increments `alreadyin`;
 *  - a new URL is added to `allUrls` and counted in `notin`; it is appended to the no-index file
 *    (`noIndexedUrls`) when `ResultLandingComponent.checkIfAllowed` rejects the record, otherwise it is
 *    appended to the current sitemap file (`finalUrls`).
 *
 * Whenever a multiple of 50000 URLs has been written, the current file is closed and
 * `createSitemapFile` starts the next one (the sitemap protocol allows at most 50,000 URLs per file).
 *
 * @param response search response; `response.body['results']` is an array of records or a single record
 * @param allUrls set of URLs seen so far; new URLs are added to it
 * @param resultsType results type, used to name a newly started sitemap file
 * @returns the same `allUrls` instance
 */
function parseAllUrls(response: any, allUrls: any, resultsType: string) {
  // Sitemap values must be entity-escaped like any other XML text.
  const escapeXml = (text: string): string => String(text)
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");

  const responses: any = response.body['results'];
  // A single record may be returned as a plain object instead of an array.
  const length = responses ? (Array.isArray(responses) ? responses.length : 1) : 0;

  if (length < 100 && length > 0) {
    console.log("num of results: " + length + " " + response.request.url);
  }
  if (length == 0) {
    // Requests that produced no results are recorded together with their HTTP status.
    fs.appendFileSync("./" + errorFileName, response.statusCode + " " + response.request.url + "\n");
  }

  for (let i = 0; i < length; i++) {
    const curResponse = Array.isArray(responses) ? responses[i] : responses;
    const resData = curResponse['result']['metadata']['oaf:entity'];
    const oafResult = resData['oaf:result'];

    // Positional arguments read by parseResultLandingInfo.
    const param = [
      oafResult,                                // 0
      oafResult['title'],                       // 1
      oafResult['rels']['rel'],                 // 2
      oafResult['children'],                    // 3
      oafResult['pid'],                         // 4
      oafResult['journal'],                     // 5
      oafResult['language'],                    // 6
      oafResult['subject'],                     // 7
      oafResult['context'],                     // 8
      oafResult['creator'],                     // 9
      oafResult['country'],                     // 10
      oafResult['programmingLanguage'],         // 11 - software
      (resData['extraInfo'] !== undefined && resData['extraInfo']['citations'] !== undefined)
        ? resData['extraInfo']['citations']['citation'] : null, // 12
      curResponse['result']['header']['dri:status'], // 13
      curResponse                               // 14
    ];

    const resultLandingService = new ResultLandingService(null);
    const resultLandingInfo: ResultLandingInfo = resultLandingService.parseResultLandingInfo(param, null, properties);
    const resultPreview: any = ResultPreview.resultLandingInfoConvert(resultLandingInfo, resultLandingInfo.resultType);
    const pid: any = Identifier.getResultPIDFromIdentifiers(resultPreview.identifiers);

    // Prefer the persistent identifier; fall back to relcanId.
    let url;
    if (pid && pid.id) {
      url = getUrlByType(resultPreview.resultType, pid, encodeURIComponent(pid.id));
    } else {
      url = getUrlByType(resultPreview.resultType, null, resultLandingInfo.relcanId);
    }

    if (allUrls.has(url)) {
      alreadyin++;
      continue;
    }

    // The component is instantiated without dependencies only to reuse its checkIfAllowed rule.
    const resultLandingComponent = new ResultLandingComponent(null, null, null,
      null, null, null, null, null, null,
      null, null, null, null, null);

    if (!resultLandingComponent.checkIfAllowed(resultLandingInfo)) {
      noIndexedUrls++;
      fs.appendFileSync("./" + noIndexFileName, url + "\n");
    } else {
      if (finalUrls > 0 && ((finalUrls % 50000) == 0)) {
        // Current file is full: close it and continue in a fresh one.
        console.log("url to be added in file: " + url);
        fs.appendFileSync("./" + fileName, "\n</urlset>");
        console.log("\n");
        createSitemapFile(resultsType);
      }
      finalUrls++;
      const urlPre = "\n<url><loc>";
      const urlSuf = "</loc></url>";
      fs.appendFileSync("./" + fileName, urlPre + escapeXml(url) + urlSuf);
    }
    allUrls.add(url);
    notin++;
  }
  return allUrls;
}
2021-06-15 15:19:37 +02:00
// function parseAllUrls1(response) {
// let allUrls = [];
//
// let responses = response.body['results'];
// let length = Array.isArray(responses) ? responses.length : 1;
//
// for (let i = 0; i < length; i++) {
// let p = new parsingFunctions.ParsingFunctions();
// let resData = Array.isArray(responses) ? responses[i]['result']['metadata']['oaf:entity']['oaf:result'] : responses['result']['metadata']['oaf:entity']['oaf:result'];
//
// let type = "result";
// if (resData['resulttype']) {
// type = resData['resulttype']['classname'];
// }
//
// if (resData['pid']) {
// let identifiers = p.parseIdentifiers(resData['pid']);
// let pid = string_utils.Identifier.getResultPIDFromIdentifiers(identifiers);
//
// if(pid && pid.id) {
// allUrls[i] = getUrlByType(type, pid, pid.id);
// } else {
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
// allUrls[i] = getUrlByType(type, null, canId);
// }
// } else {
// let canId = parsingFunctions.ParsingFunctions.parseRelCanonicalId(Array.isArray(responses) ? responses[i] : responses, "result");
// allUrls[i] = getUrlByType(type, null, canId);
// }
// }
// return allUrls;
// }
//
/**
 * Builds the landing-page URL of a result and updates the statistics counters.
 *
 * The query parameter name depends on the result type ("articleId", "datasetId", "softwareId",
 * "orpId", or "id" for any other type) and is replaced by "pid" whenever `pid` is truthy.
 * The matching per-type counter and, for a truthy `pid`, `urlsWithPid` are incremented.
 *
 * @param type result type; also used as the path segment after `landingPrefix`
 * @param pid persistent identifier of the result, or a falsy value when there is none
 * @param id value of the query parameter; appended verbatim (no encoding is applied here)
 * @returns `landingPrefix + type + "?" + parameter + "=" + id`
 */
function getUrlByType(type: any, pid: any, id: any) {
  let parameter: any;
  switch (type) {
    case "publication":
      publications++;
      parameter = "articleId";
      break;
    case "dataset":
      datasets++;
      parameter = "datasetId";
      break;
    case "software":
      software++;
      parameter = "softwareId";
      break;
    case "other":
      other++;
      parameter = "orpId";
      break;
    default:
      parameter = "id";
  }

  if (pid) {
    // A persistent identifier takes precedence over the type-specific parameter.
    urlsWithPid++;
    parameter = "pid";
  }
  return landingPrefix + type + "?" + parameter + "=" + id;
}
2021-08-06 13:19:58 +02:00
// comment out for communities query check
2021-06-15 15:19:37 +02:00
/**
 * Starts loading the community list and stores the pending request in `communitiesPromise`.
 *
 * On success `publicCommunities` is replaced by the ids of the communities returned by
 * `ContextsService.parseCommunities`. Any failure (request or parsing) is logged, and the promise
 * still settles, so code awaiting `communitiesPromise` never hangs or throws; in that case
 * `publicCommunities` keeps its previous content.
 */
function getCommunities() {
  communitiesPromise = new Promise((resolve, reject) => {
    request.get(contextUrl, function (err: any, communitiesResponse: any) {
      if (!communitiesResponse && err) {
        reject(err);
        return;
      }
      try {
        const contextsService = new ContextsService();
        publicCommunities = contextsService.parseCommunities(communitiesResponse.body, false).map(value => value.id);
        resolve(undefined);
      } catch (parseError) {
        // Without this, a parsing failure would leave the promise pending forever.
        reject(parseError);
      }
    });
  }).catch(error => console.error("Error getting communities ", error));
}
2021-08-10 11:57:25 +02:00
/**
 * Entry point of a sitemap run: starts the timer, creates the first sitemap file, names the
 * error and no-index log files after the results type and today's date, then starts loading the
 * communities and the crawl itself.
 *
 * @param resultsPerUrl page size forwarded to `get`
 * @param resultsType results type; used in file names and forwarded to `get`
 */
function buildSiteMap(resultsPerUrl, resultsType) {
  // The matching console.timeEnd is issued by `get` once the sitemap is closed.
  console.time("total_time (" + resultsPerUrl + " " + resultsType + " per request)");

  const date = new Date();
  createSitemapFile(resultsType);

  // Both log files share the same <year>_<month>_<day>.txt suffix.
  const dateSuffix = date.getFullYear() + "_" + (date.getMonth() + 1) + "_" + date.getDate() + ".txt";
  errorFileName = resultsType + "_error_" + dateSuffix;
  noIndexFileName = resultsType + "_noIndex_" + dateSuffix;

  getCommunities(); // comment out for communities query check
  get(resultsPerUrl, resultsType);
}
2021-08-10 11:57:25 +02:00
/**
 * Starts a new sitemap file and makes it the current one.
 *
 * The file is named `<resultsType>_sitemap<N>.xml`, where `<N>` is empty for the first file and
 * the number of files created so far for every later one. `fileName` and `filesCreated` are
 * updated, and the file is (over)written with the XML declaration and the opening `<urlset>` tag.
 *
 * @param resultsType results type used as file name prefix
 */
async function createSitemapFile(resultsType) {
  const sequenceSuffix = filesCreated > 0 ? filesCreated : "";
  fileName = resultsType + "_sitemap" + sequenceSuffix + ".xml";
  filesCreated++;

  console.log("Buiding sitemap in file: " + fileName + "\n");

  const header = [
    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>",
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
  ].join("\n");
  fs.writeFileSync("./" + fileName, header);
}
2021-06-15 15:19:37 +02:00
2021-08-10 11:57:25 +02:00
// ---- Output files ----
let filesCreated = 0; // number of sitemap files started so far
let fileName; // sitemap file currently being written
let errorFileName; // log of failed or empty requests
let noIndexFileName; // log of URLs that are not allowed to be indexed

const fs = require('fs');

// ---- Crawl state and statistics ----
let promiseArray = []; // one entry per started results request
let alreadyin = 0; // duplicate urls
let notin = 0; // unique urls
let finalUrls = 0; // urls written to the sitemap files
let noIndexedUrls = 0; // urls written to the no-index file
let urlsWithPid = 0;
let publications = 0;
let datasets = 0;
let software = 0;
let other = 0;

// comment out for communities query check
let communitiesPromise;
let publicCommunities = [];

// ---- Service endpoints ----
const resultsUrlPrefix = "https://services.openaire.eu/search/v2/api/resources2/?format=json";
const landingPrefix = "https://explore.openaire.eu/search/";

// let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";
// comment out for communities query check
let refineUrl = "https://services.openaire.eu/search/v2/api/resources2/?format=json&refine=true&fields=resultbestaccessright&fields=relfunder&fields=instancetypename&fields=resultlanguagename&fields=community&fields=resulthostingdatasource&fields=country&page=0&size=0&type=";

const contextUrl = "https://services.openaire.eu/openaire/contexts/";

// process.argv[3] is the "resultType" argument
const requestedResultsType = process.argv[3];
if (!requestedResultsType) {
  // Without a type every file name and request URL would contain the text "undefined".
  console.error("Missing results type argument (expected in process.argv[3])");
  process.exitCode = 1;
} else {
  buildSiteMap(200, requestedResultsType);
}