From f063556083249746b23315f175c07b37ed214cc8 Mon Sep 17 00:00:00 2001 From: Alfredo Oliviero Date: Fri, 12 Jul 2024 15:22:49 +0200 Subject: [PATCH] update_robots --- README.md | 39 +++++ update_robots/README.md | 35 ++++ update_robots/template-robots.txt | 74 +++++++++ update_robots/update_robots.py | 267 ++++++++++++++++++++++++++++++ 4 files changed, 415 insertions(+) create mode 100644 README.md create mode 100644 update_robots/README.md create mode 100644 update_robots/template-robots.txt create mode 100644 update_robots/update_robots.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..0b45cd7 --- /dev/null +++ b/README.md @@ -0,0 +1,39 @@ +# gCube System - collection of scripts for managing D4Science Responsive Liferay 6.2 Gateway + + +## Built With + +* [Python3](https://www.python.org/) + +## Change log + +See [Releases](https://code-repo.d4science.org/gCubeSystem/about-vre/releases). + +## Authors + +* **Alfredo Oliviero** - [ISTI-CNR Infrascience Group](http://nemis.isti.cnr.it/groups/infrascience) + +## Maintainers + +* **Alfredo Oliviero** - [ISTI-CNR Infrascience Group](http://nemis.isti.cnr.it/groups/infrascience) + +## License + +This project is licensed under the EUPL V.1.1 License - see the [LICENSE.md](LICENSE.md) file for details. + + +## About the gCube Framework +This software is part of the [gCubeFramework](https://www.gcube-system.org/ "gCubeFramework"): an +open-source software toolkit used for building and operating Hybrid Data +Infrastructures enabling the dynamic deployment of Virtual Research Environments +by favouring the realisation of reuse oriented policies. + +The projects leading to this software have received funding from a series of European Union programmes including: + +- the Sixth Framework Programme for Research and Technological Development + - DILIGENT (grant no. 004260); +- the Seventh Framework Programme for research, technological development and demonstration + - D4Science (grant no. 
212488), D4Science-II (grant no. 239019), ENVRI (grant no. 283465), EUBrazilOpenBio (grant no. 288754), iMarine (grant no. 283644); +- the H2020 research and innovation programme + - BlueBRIDGE (grant no. 675680), EGIEngage (grant no. 654142), ENVRIplus (grant no. 654182), Parthenos (grant no. 654119), SoBigData (grant no. 654024), DESIRA (grant no. 818194), ARIADNEplus (grant no. 823914), RISIS2 (grant no. 824091), PerformFish (grant no. 727610), AGINFRAplus (grant no. 731001); + \ No newline at end of file diff --git a/update_robots/README.md b/update_robots/README.md new file mode 100644 index 0000000..dbff799 --- /dev/null +++ b/update_robots/README.md @@ -0,0 +1,35 @@ +# update_robots.py + +## Description + +The update_robots.py script manages and updates the false-robots.txt entry stored in the typesettings column of the group_ table of a PostgreSQL database used by Liferay; the new value is generated from a template file. + +It provides various functionalities, including updating the entry based on a template file, listing current virtual hosts (vhosts), listing current false-robots.txt values and their associated hostnames, saving the current false-robots.txt value of each vhost to an individual file, and printing the differences between current and new false-robots.txt values. + +The script reads the database configuration from the portal-setup-wizard.properties file of the instance. + +## Examples of usage: + +1. Use a configuration file for database parameters and print the update queries: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --template-file template-robots.txt + +2. Specify database parameters directly from the command line and print the update queries: + python3 update_robots.py --db-host postgres --db-port 5432 --db-name liferay_db --db-user infra_bundle_dev --db-password pass_db --template-file template-robots.txt + +3. 
Execute the update queries: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --template-file template-robots.txt --execute + +4. Print the list of current vhosts: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --list-vhosts + +5. Print the list of current false-robots.txt values and related hostnames: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --list-robots + +6. Save the current vhosts to files: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --save-vhosts --output-dir currents_robots + +7. Update the false-robots.txt for a specific hostname: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --template-file template-robots.txt --hostname specific.hostname.com + +8. Print the differences between current and new false-robots.txt values: + python3 update_robots.py --config-file /home/life/Portal-Bundle/portal-setup-wizard.properties --template-file template-robots.txt --print-differences diff --git a/update_robots/template-robots.txt b/update_robots/template-robots.txt new file mode 100644 index 0000000..9988842 --- /dev/null +++ b/update_robots/template-robots.txt @@ -0,0 +1,74 @@ +User-agent: ClaudeBot +Disallow: / + +User-agent: Claude-Web +Disallow: / + +User-agent: SemrushBot +Disallow: / + +User-agent: SemrushBot-SA +Disallow: / + +User-agent: Yandex +Disallow: / + +User-agent: YandexBot +Disallow: / + +User-agent: Bytedance +Disallow: / + +User-agent: Bytespider +Disallow: / + + +User-agent: SemrushBot +Disallow: / + +User-agent: SemrushBot-SA +Disallow: / + +User-agent: Yandex +Disallow: / + +User-agent: YandexBot +Disallow: / + +User-agent: Bytedance +Disallow: / + +User-agent: Bytespider +Disallow: / + +User-agent: * +# Disallow: /home +# Disallow: /explore +Disallow: /exploratories +Disallow: /admin +Disallow: 
# -*- coding: utf-8 -*-
"""Manage the ``false-robots.txt`` entry stored in Liferay's ``group_.typesettings``.

The script can regenerate the entry from a template file (one rendering per
virtual host), list virtual hosts, list or save the current values, and show
a diff between the current and the newly rendered values.

Database parameters come from the ``--db-*`` command-line options; any that
are missing are read from the ``jdbc.default.*`` keys of the properties file
given with ``--config-file``.
"""

import argparse
import difflib
import os
import re
import sys

# Default values
DEFAULT_CONFIG_FILE = "/home/life/Portal-Bundle/portal-setup-wizard.properties"
DEFAULT_TEMPLATE_FILE = "template-robots.txt"
DEFAULT_PLACEHOLDER = "{{HOSTNAME}}"
DEFAULT_OUTPUT_DIR = "currents_robots"
DEFAULT_DB_PORT = 5432  # used when the JDBC URL carries no explicit port

# The value of the entry is taken to extend up to the next '¶' character (or
# the end of the string).
# NOTE(review): this delimiter is inherited unchanged from the first version
# of the script; confirm it matches how typesettings is stored in the DB.
ROBOTS_VALUE_RE = re.compile(r'false-robots\.txt=([^¶]*)')

# Passed to the driver as a bound parameter: a literal '%' inside the SQL text
# would be read as a format marker as soon as other parameters are bound.
ROBOTS_LIKE_PATTERN = "%false-robots.txt%"

ROBOTS_SELECT = """
    SELECT g.groupid, g.typesettings, v.hostname
    FROM public.group_ g
    JOIN public.layoutset l ON g.groupid = l.groupid
    JOIN public.virtualhost v ON l.layoutsetid = v.layoutsetid
    WHERE g.typesettings LIKE %s
"""

UPDATE_QUERY = "UPDATE public.group_ SET typesettings = %s WHERE groupid = %s;"


def read_db_config(config_file_path):
    """Parse a ``key=value`` properties file into a dict.

    Blank lines, lines without ``=`` and comment lines (first non-blank
    character ``#`` or ``!``) are skipped. Only the first ``=`` splits the
    line, so values may themselves contain ``=``.

    Raises:
        OSError: if the file cannot be opened.
    """
    config = {}
    with open(config_file_path, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped or stripped.startswith(("#", "!")) or '=' not in stripped:
                continue
            key, value = stripped.split('=', 1)
            config[key.strip()] = value.strip()
    return config


def parse_jdbc_url(jdbc_url):
    """Split ``jdbc:<driver>://host[:port]/dbname[?params]`` into its parts.

    Returns:
        A ``(host, port, dbname)`` tuple; ``port`` is an int and falls back to
        ``DEFAULT_DB_PORT`` when the URL has none. Connection parameters after
        ``?`` are dropped from the database name.

    Raises:
        ValueError: if the URL does not have the expected shape.
    """
    # Properties files may escape ':' and '=' with a backslash.
    url = jdbc_url.strip().replace("\\:", ":").replace("\\=", "=")
    match = re.match(
        r'^jdbc:[^:/]+://(?P<host>[^/:?]+)(?::(?P<port>\d+))?/(?P<name>[^/?]+)',
        url,
    )
    if match is None:
        raise ValueError(f"Unsupported JDBC URL: {jdbc_url!r}")
    port = int(match.group('port')) if match.group('port') else DEFAULT_DB_PORT
    return match.group('host'), port, match.group('name')


def read_template_file(file_path):
    """Return the content of the template file with surrounding whitespace removed."""
    with open(file_path, 'r') as file:
        return file.read().strip()


def extract_robots_value(typesettings):
    """Return the current false-robots.txt value, or None when the entry is absent."""
    match = ROBOTS_VALUE_RE.search(typesettings)
    return match.group(1) if match else None


def replace_robots_value(typesettings, new_false_robots):
    """Return *typesettings* with every false-robots.txt entry set to *new_false_robots*."""
    replacement = f'false-robots.txt={new_false_robots}'
    # A callable replacement inserts the text verbatim; a plain string would
    # have its backslash sequences interpreted by re.sub.
    return ROBOTS_VALUE_RE.sub(lambda _match: replacement, typesettings)


def fetch_robots_rows(cursor, specific_hostname=None):
    """Return ``(groupid, typesettings, hostname)`` rows for groups holding the entry.

    When *specific_hostname* is given, only rows for that virtual host are
    returned.
    """
    query = ROBOTS_SELECT
    params = [ROBOTS_LIKE_PATTERN]
    if specific_hostname:
        query += " AND v.hostname = %s"
        params.append(specific_hostname)
    cursor.execute(query, tuple(params))
    return cursor.fetchall()


# Function to update the typesettings field for all groups containing false-robots.txt
def update_typesettings_for_all_groups(cursor, template_content, placeholder_pattern, execute_updates, specific_hostname=None):
    """Render the template per virtual host and write it into typesettings.

    Args:
        cursor: DB-API cursor used for the SELECT and the UPDATE statements.
        template_content: text of the robots template.
        placeholder_pattern: literal string replaced by the row's hostname.
        execute_updates: when true the UPDATEs are executed, otherwise the
            statements are only printed.
        specific_hostname: optional hostname restricting the affected rows.

    Returns:
        True when all rows were processed, False when an error was reported
        (so the caller can roll back instead of committing a partial update).
    """
    try:
        for groupid, current_typesettings, hostname in fetch_robots_rows(cursor, specific_hostname):
            # Replace the placeholder in the template with the hostname value
            new_false_robots = template_content.replace(placeholder_pattern, hostname)
            updated_typesettings = replace_robots_value(current_typesettings, new_false_robots)

            if execute_updates:
                cursor.execute(UPDATE_QUERY, (updated_typesettings, groupid))
            else:
                # Dry run: the values are interpolated for display only.
                print(UPDATE_QUERY % (updated_typesettings, groupid))
    except Exception as e:
        print(f"Error during update: {e}", file=sys.stderr)
        return False
    return True


# Function to print the list of current vhosts
def print_current_vhosts(cursor):
    """Print the hostname of every row in ``public.virtualhost``."""
    try:
        cursor.execute("""
            SELECT v.hostname
            FROM public.virtualhost v
        """)
        rows = cursor.fetchall()
        print("List of current vhosts:")
        for row in rows:
            print(row[0])
    except Exception as e:
        print(f"Error retrieving vhosts: {e}", file=sys.stderr)


# Function to print the list of current false-robots.txt values
def print_current_robots(cursor):
    """Print the current false-robots.txt value next to each hostname."""
    try:
        rows = fetch_robots_rows(cursor)
        print("List of current false-robots.txt values and related hostnames:")
        for _groupid, typesettings, hostname in rows:
            current = extract_robots_value(typesettings)
            if current is not None:
                print(f"Hostname: {hostname} - Robots: {current}")
    except Exception as e:
        print(f"Error retrieving false-robots.txt values: {e}", file=sys.stderr)


# Function to save the current vhosts to files
def save_current_vhosts(cursor, output_dir):
    """Write each vhost's current false-robots.txt value to ``<output_dir>/<hostname>.txt``.

    The directory is created when missing. A file is created for every row,
    and stays empty when the value cannot be extracted.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        for _groupid, typesettings, hostname in fetch_robots_rows(cursor):
            file_path = os.path.join(output_dir, f"{hostname}.txt")
            current = extract_robots_value(typesettings)
            with open(file_path, 'w') as file:
                if current is not None:
                    file.write(current)
            print(f"Saved {hostname} robots.txt to {file_path}")
    except Exception as e:
        print(f"Error saving vhosts: {e}", file=sys.stderr)


# Function to print differences between current and new robots.txt
def print_robots_differences(cursor, template_content, placeholder_pattern, specific_hostname=None):
    """Print a unified diff between the stored and the newly rendered value per hostname."""
    try:
        for _groupid, current_typesettings, hostname in fetch_robots_rows(cursor, specific_hostname):
            new_false_robots = template_content.replace(placeholder_pattern, hostname)
            current_false_robots = extract_robots_value(current_typesettings)
            if current_false_robots is None:
                continue
            diff = difflib.unified_diff(
                current_false_robots.splitlines(),
                new_false_robots.splitlines(),
                lineterm='',
            )
            print(f"Differences for hostname {hostname}:")
            print('\n'.join(diff))
    except Exception as e:
        print(f"Error retrieving differences: {e}", file=sys.stderr)


def resolve_db_settings(args):
    """Merge ``--db-*`` options with the values found in the configuration file.

    Command-line values win. The configuration file is opened only when at
    least one parameter is missing from the command line, so a fully
    specified command line works without any properties file.

    Returns:
        A dict with the keys ``host``, ``port``, ``dbname``, ``user`` and
        ``password``.

    Raises:
        OSError, KeyError, ValueError: when a needed value has to come from
            the configuration file and the file, the key or the JDBC URL is
            unusable.
    """
    overrides = {
        "host": args.db_host,
        "port": args.db_port,
        "dbname": args.db_name,
        "user": args.db_user,
        "password": args.db_password,
    }
    if all(overrides.values()):
        return overrides

    config = read_db_config(args.config_file)
    from_file = {}
    if not (overrides["host"] and overrides["port"] and overrides["dbname"]):
        host, port, dbname = parse_jdbc_url(config['jdbc.default.url'])
        from_file.update(host=host, port=port, dbname=dbname)
    if not overrides["user"]:
        from_file["user"] = config['jdbc.default.username']
    if not overrides["password"]:
        from_file["password"] = config['jdbc.default.password']
    return {key: value or from_file[key] for key, value in overrides.items()}


def build_parser():
    """Create the command-line parser, including the usage examples epilog."""
    parser = argparse.ArgumentParser(
        description="Update the false-robots.txt field in the group_ table.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog=f"""
Examples of usage:

1. Use a configuration file for database parameters and print the update queries:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --template-file {DEFAULT_TEMPLATE_FILE}

2. Specify database parameters directly from the command line and print the update queries:
    python3 update_robots.py --db-host postgres --db-port 5432 --db-name liferay_db --db-user infra_bundle_dev --db-password pass_db --template-file {DEFAULT_TEMPLATE_FILE}

3. Execute the update queries:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --template-file {DEFAULT_TEMPLATE_FILE} --execute

4. Print the list of current vhosts:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --list-vhosts

5. Print the list of current false-robots.txt values and related hostnames:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --list-robots

6. Save the current vhosts to files:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --save-vhosts --output-dir {DEFAULT_OUTPUT_DIR}

7. Update the false-robots.txt for a specific hostname:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --template-file {DEFAULT_TEMPLATE_FILE} --hostname specific.hostname.com

8. Print the differences between current and new false-robots.txt values:
    python3 update_robots.py --config-file {DEFAULT_CONFIG_FILE} --template-file {DEFAULT_TEMPLATE_FILE} --print-differences
    """)

    parser.add_argument("--config-file", default=DEFAULT_CONFIG_FILE, help=f"Path to the configuration file with database parameters (default: {DEFAULT_CONFIG_FILE})")
    parser.add_argument("--db-host", help="Database host")
    parser.add_argument("--db-port", type=int, help="Database port")
    parser.add_argument("--db-name", help="Database name")
    parser.add_argument("--db-user", help="Database user")
    parser.add_argument("--db-password", help="Database password")
    parser.add_argument("--template-file", default=DEFAULT_TEMPLATE_FILE, help=f"Path to the template file for false-robots.txt (default: {DEFAULT_TEMPLATE_FILE})")
    parser.add_argument("--placeholder", default=DEFAULT_PLACEHOLDER, help=f"Placeholder pattern to replace in the template file (default: {DEFAULT_PLACEHOLDER})")
    parser.add_argument("--execute", action="store_true", help="Execute the update queries (default is to print the queries)")
    parser.add_argument("--list-vhosts", action="store_true", help="Print the list of current vhosts")
    parser.add_argument("--list-robots", action="store_true", help="Print the list of current false-robots.txt values and related hostnames")
    parser.add_argument("--save-vhosts", action="store_true", help=f"Save the current vhosts to files (default directory: {DEFAULT_OUTPUT_DIR})")
    parser.add_argument("--output-dir", default=DEFAULT_OUTPUT_DIR, help=f"Output directory to save the vhosts (default: {DEFAULT_OUTPUT_DIR})")
    parser.add_argument("--hostname", help="Specific hostname to update")
    parser.add_argument("--print-differences", action="store_true", help="Print differences between current and new false-robots.txt values")
    return parser


# Main function
def main():
    """Parse the command line, connect to the database and run the selected action.

    Returns:
        0 on success, 1 when the update reported an error (in which case the
        transaction is rolled back).
    """
    parser = build_parser()

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    # Only the diff and update actions render the template; the listing and
    # saving actions must not fail because no template file is around.
    needs_template = not (args.list_vhosts or args.list_robots or args.save_vhosts)
    template_content = read_template_file(args.template_file) if needs_template else None

    db_settings = resolve_db_settings(args)

    # Imported here so that --help and the pure helpers above work on hosts
    # where the PostgreSQL driver is not installed.
    import psycopg2

    conn = psycopg2.connect(**db_settings)
    try:
        cursor = conn.cursor()
        try:
            succeeded = True
            if args.list_vhosts:
                print_current_vhosts(cursor)
            elif args.list_robots:
                print_current_robots(cursor)
            elif args.save_vhosts:
                save_current_vhosts(cursor, args.output_dir)
            elif args.print_differences:
                print_robots_differences(cursor, template_content, args.placeholder, args.hostname)
            else:
                succeeded = update_typesettings_for_all_groups(
                    cursor, template_content, args.placeholder, args.execute, args.hostname
                )

            if succeeded:
                conn.commit()
            else:
                conn.rollback()
        finally:
            cursor.close()
    finally:
        conn.close()

    return 0 if succeeded else 1


if __name__ == "__main__":
    sys.exit(main())