diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6236f9f3b1..02bb77fe9a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,7 +40,6 @@ jobs: ELASTICSEARCH_WRITE_AUTH: ${{ secrets.ELASTICSEARCH_WRITE_AUTH }} run: | npm run antora - bin/optimize_crawl -x echo 'doc.owncloud.com' > public/CNAME - name: Save cache diff --git a/bin/optimize_crawl b/bin/optimize_crawl deleted file mode 100755 index fc1180f856..0000000000 --- a/bin/optimize_crawl +++ /dev/null @@ -1,400 +0,0 @@ -#!/usr/bin/env bash - -# optimizes sitemap-xxx.xml and robots.txt for improved crawling as we only need references to "next" urls. -# -# 1.) deletes in all sitemap-.xml files all xml blocks not containing the term "next" (DEFAULT_NAME) -# post the product name. this is done only for only those xml files, which have the new docs structure -# containing "next" as part of the url for the master branch. -# -# 2.) if uncommented in the code, adds at the end of the robots.txt file "Disallow" directives for all -# non "next" urls/* which match the new docs structure containing "next", e.g. Disallow: /server/10.8/* -# -# the script is intended to be called from the root of this repo. -# -# see the "PRODUCTS" variable for eligable repos -# -# note, no file should be created manually or by an other process with the extension defined in BACKUP_EXT. -# -# -# SEE IMPORTANT COMMENT AT ABOUT LINE 200 - -set -e -set -u -set -o noclobber -set -o errexit -set -o pipefail -set -o nounset -IFS=$'\n\t' - -# these are the products which have the new doc structure using a version, at minimum 'next'. -# only those are taken for optimisation! -# the name must be identical to the name tag in antora.yml of the particular doc repo. 
-PRODUCTS=(docs server ocis webui desktop ios-app android branding) - -# the location where the site is built to -HTML_ROOT="public" - -# the name of the master sitemap file -# this file includes all other sitemaps -SITEMAP="sitemap.xml" - -# define common variables -ACTION= -DOC_WEB_ROOT= -SITEMAPS_TO_USE= -DRY_RUN=false - -# note that the extension must include the leading dot -BACKUP_EXT=".original" - -# the default name is the name used when the branch is master. -# only for those products which have the new doc structure. -DEFAULT_NAME="next" - -# error variables -ERR_UNSUPPORTED_ACTION=22 - -SITEMAP_ROOT_FILE="${HTML_ROOT}/${SITEMAP}" - -# the main function which processes all files -function run() -{ - local i= - local url= - local hit= - local first= - local actual=() - local product_name= - local current_map= - local current_robot= - local product_name= - local unuseable_releases=() - local sitemap_header= - local url_component_ok= - local new_xml_content= - local add_robot_content= - - current_robot="${HTML_ROOT}/robots.txt" - - # find all files in the html root dir, and only there, where the extension is backup_ext - # if found, those files should be reverted back to the original state before continue executing - for i in `find "${HTML_ROOT}" -maxdepth 1 -name "*${BACKUP_EXT}" -type f`; do - hit=true - done - if [ "$hit" = true ]; then - echo - echo -e "\e[1;31mThere were files found indicating a past run. \e[0m" - echo -e "\e[1;31mRevert them first. \e[0m" - usage - exit $ERR_UNSUPPORTED_ACTION - fi - - echo -e "\e[1;32mOptimizing \e[0m" - echo - - # changes in the robots file only when not in debug mode - if [[ "$DRY_RUN" == false ]]; then - # create a copy of the robots file to make a backup - cp "${current_robot}" "${current_robot}${BACKUP_EXT}" - - # add a blank line for better visibility to the following disallows - # (not using $'\n'... 
becomes a double blank line when used alone) - cat <<< "" >> "${current_robot}" - fi - - # get the current existing docs url from sitemap.xml - get_doc_www_root "${SITEMAP_ROOT_FILE}" - - # get the list of sitemaps to process - get_sitemaps "${SITEMAP_ROOT_FILE}" "${PRODUCTS[@]}" - #printf '%s\n' "${SITEMAPS_TO_USE[@]}" - - # iterate over all sitemap files - for current_map in "${SITEMAPS_TO_USE[@]}" - do - - new_xml_content= - - #echo "${current_map}" - # get the product name of the current iterating map - product_name=($(get_product_name "${current_map}")) - # echo "$product_name" - - # get all the releases we do not need to keep - unuseable_releases=($(get_unuseable_releases "${current_map}")) - # printf '%s\n' "${unuseable_releases[@]}" - -# keep this, for testing purposes only -# # get the header of the sitemap file -# sitemap_header=$(get_sitemap_header "${current_map}") -# #echo "$sitemap_header" - - url_component_ok="${DOC_WEB_ROOT}"/"${product_name}"/"${DEFAULT_NAME}"/ - - i=-1 - url=false - hit=false - first=true - unset actual - - # read the current xml file and remove all non-"next" entries - while read line; do - i=$((i+1)) - actual[$i]="${line}" - - # starting the block. 
we expect that are in seperate lines - if [[ "${line}" == "" ]]; then - url=true - first=false - else - # get all lines before the first , there is always necessary stuff above to be taken - # we do not need an array here but add the lines one by one - if [ "$first" = true ]; then - new_xml_content="${new_xml_content}""${line}"$'\n' - # unset the array to remove any leftovers for the first match of - # necessary as we do not know if the first url block will be a hit or not - unset actual - #printf '%s\n' "${new_xml_content[@]}" - continue - fi - fi - - # we are inside a block - if [ "$url" = true ]; then - # check if it is a block - if [[ "${line}" == ""* ]]; then - # check if it contains a usable url with "next" - if [[ "${line}" == *"${url_component_ok}"* ]]; then - hit=true - #echo "${line}" - fi - fi - fi - - # at the end of the block - if [[ "${line}" == "" ]]; then - # use only if we have a hit - if [ "$hit" == true ]; then - #printf '%s\n' "${actual[@]}" - new_xml_content="${new_xml_content}"$(printf '%s\n' "${actual[@]}")$'\n' - fi - # if we had a hit, reset all - i=-1 - url=false - hit=false - unset actual - fi - - done < "${current_map}" - - # finally, add all the remaining elements which are post the last - # the last actual array contains all lines after the last - new_xml_content="${new_xml_content}"$(printf '%s\n' "${actual[@]}") - - # create a disallow entry for the particular product containig non- "next" entries - unset actual - add_robot_content= #$'\n' - for actual in "${unuseable_releases[@]}" - do - #echo $actual - url_component_ok=/"${product_name}"/"${actual}"/* -# -# NOTE the following script line is temporarily commented. -# -# Google can't reindex a site where robots.txt disallows urls which are, when accessed, redirected to a disallowed -# page, even robots.txt is valid and does not disallow the site itself. 
-# -# robots.txt --> disallow: /server/10.8/* -# docs.owncloud.com --> docs.owncloud.com/server/latest --> docs.owncloud.com/server/10.8 (** bang **) -# -# when server gets its own repository and the main site entry does not automatically redirect to server/latest, -# this comment can be removed and the disallow is added again. -# -# add_robot_content="${add_robot_content}""Disallow: ""${url_component_ok}"$'\n' - done - - # echo the output in debug mode but do not do anything else - # make the headline green for better identification of the current xml file - if [[ "$DRY_RUN" == true ]]; then - echo - echo -e "\e[1;32m${current_map} \e[0m" - echo - echo "${new_xml_content}" - echo - echo -e "\e[1;32mTo be added to ${HTML_ROOT}/robots.txt\e[0m" - echo "${add_robot_content}" - else - # write the changes to the files - # backup the current sitemap file - mv "${current_map}" "${current_map}${BACKUP_EXT}" - # create a new sitemap file and write the new contents - cat <<< "$new_xml_content" > "$current_map" - # append the contents to the new robots file - cat <<< "$add_robot_content" >> "${current_robot}" - fi - - # for testing to make only one block -# break - done - - # remove any emtpy lines at the END of the script - # this keeps the content compact if not adding disallows - sed -i -e :a -e '/^\n*$/{$d;N;ba' -e '}' "${current_robot}" - -} - -# get the header lines of the current sitemap file -function get_sitemap_header() -{ - # example - # - # - local sh= - sh=$(sed '//Q' "$1") - echo "${sh}" -} - -# get all unusabe releases of the current sitemap file -function get_unuseable_releases() -{ - local a=() - local ur=() - - # make an array of the default name to intersect with the existing names - local name=( "${DEFAULT_NAME}" ) - - ur=($(grep -oP '(?<=loc>)[^<]+' "$1" | grep -Po '\w\K/\w+[^?]+' | cut -c 2- | cut -d \/ -f 2 | sort | uniq)) - - # intersect the arrays - # the array must be returned in this case with printf and not with echo - readarray -t a < <(echo 
${ur[@]} ${name[@]} | tr ' ' '\n' | sort | uniq -d | xargs echo ${ur[@]} | tr ' ' '\n' | sort | uniq -u) - printf '%s\n' "${a[@]}" -} - -# get the product name of a particular sitemap file -function get_product_name() -{ - local pn= - pn=($(grep -oP '(?<=loc>)[^<]+' "$1" | grep -Po '\w\K/\w+[^?]+' | cut -c 2- | cut -d \/ -f 1 | uniq)) - echo "${pn}" -} - -# query the URL of the docs site. -# this can also be a local address (e.g. ip:port) if you have built e.g. with "yarn antora-local". -# will be used to identify a deletable block in a sitemap file -function get_doc_www_root() -{ - DOC_WEB_ROOT=($(grep -oP '(?<=loc>)[^<]+' "$1" | cut -d/ -f1-3 |uniq)) - #echo "${DOC_WEB_ROOT}" -} - -# get_sitemaps -# get all sitemap files which are available and match the product array list -# using parameters helps customizing the function easily -function get_sitemaps() -{ - local a=() # helper variable - local sitemap_base_file="$1" # Save first argument in a variable - shift # Shift all arguments to the left (original $1 gets "lost") - local b=("$@") # Rebuild the products array with rest of arguments - local found_sitemaps=() # the list of sitemap files found - - # append .xml to each availabe product and sort the outcome - a=( "${b[@]/%/.xml}" ) - readarray -t available_products < <(printf '%s\n' "${a[@]}" | sort) - #printf '%s\n' "${available_products[@]}" - - # get the list of referenced sitemaps from the main sitemap file and store it into an array - # the name is stored without the extension ".xml" - # the files are defined inbetween the tags - # for matching the arrays, we need to remove the leading "sitemap-" string - readarray -t found_sitemaps < <(grep -oP '(?<=loc>)[^<]+' "${sitemap_base_file}" | grep -Po '\w\K/\w+[^?]+' | cut -c 2- | sed 's/sitemap-//g' | sort) - #printf '%s\n' "${found_sitemaps[@]}" - - # Intersection of possible and available sitmap files - readarray -t a < <(echo ${available_products[@]} ${found_sitemaps[@]} | tr ' ' '\n' | sort | uniq -d) - a=( 
"${a[@]/#/sitemap-}" ) - #printf '%s\n' "${a[@]}" - SITEMAPS_TO_USE=( "${a[@]/#/${HTML_ROOT}/}" ) - #printf '%s\n' "${SITEMAPS_TO_USE[@]}" -} - -# rename all files with the extension ${BACKUP_EXT} back to its original extension. -# this is only when playing around and you do not want to manually type all the commands -function revert() -{ - local f= - - echo - # find all files in the html root dir, and only there, where the extension is backup_ext - for f in `find "${HTML_ROOT}" -maxdepth 1 -name "*${BACKUP_EXT}" -type f`; do - # move the file by removing the backup extension - mv -- "${f}" "${f%${BACKUP_EXT}}" - echo -e "\e[1;32mReverting file: \e[0m" "${f}" - #echo "${f%${BACKUP_EXT}}" - #echo "$f" - done - # if f is empty there was nothing found to be reverted - if [ -z "$f" ]; then - echo -e "\e[1;32mNothing found to be reverted \e[0m" "${f}" - fi - echo -} - -function revert_and_run() -{ - revert - run -} - -function usage() -{ - echo - echo "Usage: bin/optimize_crawl [-h] [-e] [-x] [-d] [-r]" - echo - echo "-h ... help" - echo "-e ... Execute (add debug mode for a dry run)" - echo "-x ... Revert first and Execute (add debug mode for a dry run)" - echo "-d ... Debug mode, print only of the optimized sitmap content, no saving." - echo "-r ... Revert backuped files to originals if exists." 
- echo -} - -while getopts ":rdexh:" o -do - case ${o} in - r) - ACTION="REVERT" - ;; - d ) - DRY_RUN=true - ;; - e) - ACTION="RUN" - ;; - x) - ACTION="REVERT_AND_RUN" - ;; - h | * ) - ACTION="HELP" - ;; - esac -done - -shift $((OPTIND-1)) - -case "$ACTION" in - REVERT) - revert - ;; - RUN) - run - ;; - REVERT_AND_RUN) - revert_and_run - ;; - HELP | *) - usage - exit $ERR_UNSUPPORTED_ACTION - ;; -esac diff --git a/ext-antora/sitemap-cleanup.js b/ext-antora/sitemap-cleanup.js new file mode 100644 index 0000000000..f9929281e5 --- /dev/null +++ b/ext-antora/sitemap-cleanup.js @@ -0,0 +1,320 @@ +"use strict" + +/** + * Cleanup sitemap files - remove any links that should not be kept - SEO and search relevant + * Version 1.0.0 + * + * @param {Object} playbook configuration object - The configuration object for Antora + * @param {Object} config configuration object - Configuration provided by the playbook + * @param {Object} playbook.site - Site-related configuration data. + * @param {String} playbook.site.url - The base URL of the site. + * @param {Object} playbook.output - Output-related configuration data. + * @param {String} playbook.output.dir - The output directory of the build. + * @param {Array} [config.validsegments] - Segments to keep in sitemap files ['next', 'latest', ...] 
+ * @param {String} [config.preferredsegment] - Optional, segment (singular) to keep if multiple segments are present + */ + +const fs = require('fs') +const smf = 'sitemap.xml' +const sitemap_keyword = 'sitemap-' + +module.exports.register = function ({ config }) { + this.on("sitePublished", async ({ playbook, publications }) => { + + // get relevant Antora data + + const logger = this.getLogger('sitemap-cleanup') + const siteUrl = playbook.site.url + const outputDir = publications[0].resolvedPath + + // read playbook config data + + let validSegments = [] + if(config.validsegments) { + config.validsegments.forEach(element => validSegments.push(element)) + //validSegments.forEach(element => logger.warn(element)) + } else { + console.log() + logger.warn('Config: No valid segment(s) found to keep for sitemaps. Exiting \n') + this.stop() + } + + // note that preferredsegment is a string and not an array in the playbook + // if omitted, validSegments are used + let preferredSegments = validSegments + if(config.preferredsegment) { + if (validSegments.includes(config.preferredsegment)) { + preferredSegments = [] + preferredSegments.push(config.preferredsegment) + } else { + console.log() + logger.warn('Config: preferredsegment, if set, must match one of the validsegments.
Exiting \n') + this.stop() + } + } + + // sitemaps processed will be printed to the console + let printSitemapFound = null + if(config.printsitemapfound) { + printSitemapFound = true + } else { + printSitemapFound = false + } + + // changed content of sitemaps processed will be printed to the console + let printContent = null + if(config.printcontent) { + printContent = true + } else { + printContent = false + } + + // check if there is a sitemap file present and load its content + // return to Antora if no sitemap file is found + + var content = null + const main_sitemap_file = outputDir + '/' + smf + try { + content = await get_file_from_local(main_sitemap_file) + } catch (e) { + console.log('\n') + logger.warn('There is an error accessing the ' + smf + ' file. Continuing with Antora') + if (e.code === 'ENOENT') { + logger.warn(smf + ' not found at:') + logger.warn(main_sitemap_file) + } else { + logger.warn(e) + } + console.log('\n') + return + } + + // based on the content of the main sitemap file, check which ones to parse + // return to Antora if no parsable tags are found: ( | + + var parsable_sitemaps = [] + if (content.includes('')) { + // the sitemap file itself uses url tags which only occurs when no sub-sitemaps are used + parsable_sitemaps.push(outputDir + '/' + smf) + } else if (content.includes('')) { + // check for sub-sitemaps + parsable_sitemaps = get_parsable_sitemap_files (content, siteUrl + '/', outputDir + '/') + if (!parsable_sitemaps.length) { + console.log('\n') + logger.warn('No parseable sub-sitemaps found in '+ smf + ' Continuing with Antora') + console.log('\n') + return + } + } else { + // nothing found at all, something must went wrong with the sitemap created by Antora + console.log('\n') + logger.warn('No parseable content found in '+ smf + ' Continuing with Antora') + console.log('\n') + return + } + + // parse each sitemap, replace content on matches and save the result: + // remove any entries that are NOT defined via config 
variables + + console.log() + parsable_sitemaps.forEach(element => parse_sitemap_file(parsable_sitemaps.length, outputDir, content, + siteUrl, validSegments, preferredSegments, printSitemapFound, printContent, logger, element)) + + }) + +} + +/** + * parse the sitemap file that is handed over and save any changes back + * + * @param {integer} total_sitemaps the number of sitemap files found + * @param {string} content the content of sitemap.xml + * @param {string} site_url the siteURL to build for + * @param {string} output_dir the directory where the sitemap files are located + * @param {bool} print_sitemap_found print the sitemap file component name + * @param {bool} print_content print the changed sitemap file content + * @param {array} valid_segments array of segments that are technically valid + * @param {array} preferred_segments array of segments whose entries are kept, each must also be in valid_segments + * @param {object} logger the logger object + * @param {string} file the sitemap file to process + * @return no data returned + */ +async function parse_sitemap_file (total_sitemaps, output_dir, content, site_url, valid_segments, + preferred_segments, print_sitemap_found, print_content, logger, file) { + + // get the name of the component from the filename or the contents of the file + // the component name is important to properly strip off all data from the left that is not required + // for multi version components, this will always be a component name + // for single version components: either empty || a component name + var component = '' + if (total_sitemaps > 1) { + // because there are more sitemap files, the component name can be derived from the filename + component = file.replace(output_dir + '/', '').replace(sitemap_keyword, '').replace('.xml', '') + } else { + // because there is only one sitemap file, the component name must be gathered from the contents + const first_match = content.match(/([^<]*)<\/loc>/) + const temp_string =
first_match[0].replace('','').replace('','').replace(site_url + '/','') + const index = temp_string.indexOf('/') + component = temp_string.substring(0, index) + } + + // if the component name is empty, there is nothing to do, because it is a versionless ROOT component + if (component.length === 0) return + + // get the contents of the sitemap file + try { + var sitemap_content = await get_file_from_local(file) + } catch (e) { + console.log('\n') + logger.warn('Cant access file. Continuing with next sitemap file.') + if (e.code === 'ENOENT') { + logger.warn(file + ' not found') + } else { + logger.warn(e) + } + console.log('\n') + return + } + + // print the component name of the sitemap file found + if (print_sitemap_found) { + logger.warn('Processing component: ' + component) + } + + // get all multiline url blocks (from the opening url tag to the closing url tag) + const all_url_matches = sitemap_content.match(/(?:)[\s\S]*?(?:<\/url>)/gm) + + // if there is no url block structure, return, nothing needs to be done for this file + if (all_url_matches.length === 0) { + if (print_sitemap_found) { + logger.warn('No processable data found, continuing') + } + return + } + + + var i = null + var hit = null + var version = null + var has_changed = false + + for (i=0; i < all_url_matches.length; i++) { + hit = true + // extract the version from the url. here, a location always has a version. no version case is sorted out above + // http://localhost:8080/component/version/file.html + version = all_url_matches[i].match(/([^<]*)<\/loc>/gm).toString() + version = version.replace('','').replace('', '').replace(site_url + '/', '').replace(component + '/', '') + version = version.substring(0, version.indexOf("/")) + + // check if there is a miss.
version found does not match the version allowed + if (valid_segments.includes(version)) { + if (preferred_segments.includes(version)) { + hit = false + has_changed = true + } + } + + // the block identified needs to be removed + if (hit) { + sitemap_content = sitemap_content.replace(all_url_matches[i], '') + } + } + + // if nothing has changed, return + if (!has_changed) { + if (print_sitemap_found) { + logger.warn('The content of the sitemap was not changed') + } + return + } + + // remove all blank lines caused by the removal process + sitemap_content = removeEmptyLinesRegex(sitemap_content) + + // print the content of the sitemap file + if (print_content) { + logger.warn('New content:') + console.log(sitemap_content) + } + + // write the changed content back to file + // on error, we can't predict most common ones, so we print the complete message + try { + fs.writeFileSync(file, sitemap_content, {encoding: 'utf8', mode: 0o664}) + } catch (e) { + console.log(e) + } + + if (print_sitemap_found) { + logger.warn('New sitemap data successfully written back') + } + + return +} + +/** + * return the string content with removed blank lines + * + * @param {string} content string that contains possibly blank lines + * @return {string} result the content with the blank lines removed + * + */ +function removeEmptyLinesRegex(content) { + // Regex explanation: + // ^: Matches the beginning of a line + // \s*: Matches zero or more whitespace characters (space, tab, newline, etc.) + // $: Matches the end of a line + // /gm: Global (g) and multiline (m) flags + // 'g' ensures all matches are replaced, not just the first. + // 'm' ensures '^' and '$' match the start/end of each line, not just the entire string.
+ return content.replace(/^\s*[\r\n]/gm, '').replace(/(\r\n|\r|\n){2,}/g, '$1'); +} + +/** + * return an array with parsable sitemap files + * + * @param {string} content the content of sitemap.xml + * @param {string} site_url the siteURL to build for + * @param {string} output_dir the directory where the sitemap files are located + * @return {array} result a list of full-path sitemap files to parse + */ +function get_parsable_sitemap_files (content, site_url, output_dir) { +// https://www.linkedin.com/pulse/parsing-xml-javascript-python-sergiu-panaite-1 + const all_matches = content.match(/([^<]*)<\/loc>/g) + // all_matches.forEach(element => console.log(element)) + + // exit if no sub-sitemaps can be found, then the array is not initialized + if (!Array.isArray(all_matches)) { return [] } + + // strip off the xml tag and the site_url + var i = null + var temp_res = [] + for (i=0; i < all_matches.length; i++) { + temp_res[i] = all_matches[i].replace('','').replace('','').replace(site_url,'') + } + //temp_res.forEach(element => console.log(element)) + + // add the path + const result = temp_res.map(element => `${output_dir}${element}`) + // result.forEach(element => console.log(element)) + + return result +} + +/** + * read the contents of a given file + * + * @param {string} file the file to read + * @return {string} data the data from the file read + * @return {object} error the error if thrown, needs try/catch from the caller + */ +function get_file_from_local(file) { + // promise a file from local filesystem + // when executed it returns the contents if found + return new Promise((resolve, reject) => { + fs.readFile(file, 'utf8', (error, data) => { + if (error) reject(error) + resolve(data) + }) + }) +} diff --git a/site.yml b/site.yml index 6a9437e227..ca5966e608 100644 --- a/site.yml +++ b/site.yml @@ -85,6 +85,12 @@ antora: #attributefile: https://raw.githubusercontent.com/owncloud/docs/refs/heads/master/global-attributes.yml attributefile:
./global-attributes.yml enabled: true + - require: ./ext-antora/sitemap-cleanup.js + validsegments: ['next', 'latest'] + preferredsegment: latest + printsitemapfound: true + printcontent: false + enabled: true # for testing only, prints out attributes used # use only one or the other, output can be big #- ./ext-antora/attributes-used-in-site-yml.js