diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6236f9f3b1..02bb77fe9a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -40,7 +40,6 @@ jobs:
           ELASTICSEARCH_WRITE_AUTH: ${{ secrets.ELASTICSEARCH_WRITE_AUTH }}
         run: |
           npm run antora
-          bin/optimize_crawl -x
           echo 'doc.owncloud.com' > public/CNAME
 
       - name: Save cache
diff --git a/bin/optimize_crawl b/bin/optimize_crawl
deleted file mode 100755
index fc1180f856..0000000000
--- a/bin/optimize_crawl
+++ /dev/null
@@ -1,400 +0,0 @@
-#!/usr/bin/env bash
-
-# optimizes sitemap-xxx.xml and robots.txt for improved crawling as we only need references to "next" urls.
-#
-# 1.) deletes in all sitemap-<product>.xml files all xml blocks not containing the term "next" (DEFAULT_NAME)
-# post the product name. this is done only for only those xml files, which have the new docs structure
-# containing "next" as part of the url for the master branch.
-#
-# 2.) if uncommented in the code, adds at the end of the robots.txt file "Disallow" directives for all
-# non "next" urls/* which match the new docs structure containing "next", e.g. Disallow: /server/10.8/*
-#
-# the script is intended to be called from the root of this repo.
-#
-# see the "PRODUCTS" variable for eligable repos
-#
-# note, no file should be created manually or by an other process with the extension defined in BACKUP_EXT.
-#
-#
-# SEE IMPORTANT COMMENT AT ABOUT LINE 200
-
-set -e
-set -u
-set -o noclobber
-set -o errexit
-set -o pipefail
-set -o nounset
-IFS=$'\n\t'
-
-# these are the products which have the new doc structure using a version, at minimum 'next'.
-# only those are taken for optimisation!
-# the name must be identical to the name tag in antora.yml of the particular doc repo.
-PRODUCTS=(docs server ocis webui desktop ios-app android branding)
-
-# the location where the site is built to
-HTML_ROOT="public"
-
-# the name of the master sitemap file
-# this file includes all other sitemaps
-SITEMAP="sitemap.xml"
-
-# define common variables
-ACTION=
-DOC_WEB_ROOT=
-SITEMAPS_TO_USE=
-DRY_RUN=false
-
-# note that the extension must include the leading dot
-BACKUP_EXT=".original"
-
-# the default name is the name used when the branch is master.
-# only for those products which have the new doc structure.
-DEFAULT_NAME="next"
-
-# error variables
-ERR_UNSUPPORTED_ACTION=22
-
-SITEMAP_ROOT_FILE="${HTML_ROOT}/${SITEMAP}"
-
-# the main function which processes all files
-function run()
-{
-	local i=
-	local url=
-	local hit=
-	local first=
-	local actual=()
-	local product_name=
-	local current_map=
-	local current_robot=
-	local product_name=
-	local unuseable_releases=()
-	local sitemap_header=
-	local url_component_ok=
-	local new_xml_content=
-	local add_robot_content=
-
-	current_robot="${HTML_ROOT}/robots.txt"
-
-	# find all files in the html root dir, and only there, where the extension is backup_ext
-	# if found, those files should be reverted back to the original state before continue executing
-	for i in `find "${HTML_ROOT}" -maxdepth 1 -name "*${BACKUP_EXT}" -type f`; do
-		hit=true
-	done
-	if [ "$hit" = true ]; then
-		echo
-		echo -e "\e[1;31mThere were files found indicating a past run. \e[0m"
-		echo -e "\e[1;31mRevert them first. \e[0m"
-		usage
-		exit $ERR_UNSUPPORTED_ACTION
-	fi
-
-	echo -e "\e[1;32mOptimizing \e[0m"
-	echo
-
-	# changes in the robots file only when not in debug mode
-	if [[ "$DRY_RUN" == false ]]; then
-		# create a copy of the robots file to make a backup
-		cp "${current_robot}" "${current_robot}${BACKUP_EXT}"
-
-		# add a blank line for better visibility to the following disallows
-		# (not using $'\n'... becomes a double blank line when used alone)
-		cat <<< "" >> "${current_robot}"
-	fi
-
-	# get the current existing docs url from sitemap.xml
-	get_doc_www_root "${SITEMAP_ROOT_FILE}"
-
-	# get the list of sitemaps to process
-	get_sitemaps "${SITEMAP_ROOT_FILE}" "${PRODUCTS[@]}"
-	#printf '%s\n' "${SITEMAPS_TO_USE[@]}"
-
-	# iterate over all sitemap files
-	for current_map in "${SITEMAPS_TO_USE[@]}"
-	do
-
-		new_xml_content=
-
-		#echo "${current_map}"
-		# get the product name of the current iterating map
-		product_name=($(get_product_name "${current_map}"))
-		# echo "$product_name"
-
-		# get all the releases we do not need to keep
-		unuseable_releases=($(get_unuseable_releases "${current_map}"))
-		# printf '%s\n' "${unuseable_releases[@]}"
-
-# keep this, for testing purposes only
-#		# get the header of the sitemap file
-#		sitemap_header=$(get_sitemap_header "${current_map}")
-#		#echo "$sitemap_header"
-
-		url_component_ok="${DOC_WEB_ROOT}"/"${product_name}"/"${DEFAULT_NAME}"/
-
-		i=-1
-		url=false
-		hit=false
-		first=true
-		unset actual
-
-		# read the current xml file and remove all non-"next" entries
-		while read line; do
-			i=$((i+1))
-			actual[$i]="${line}"
-
-			# starting the block. we expect that <url> </url> are in seperate lines
-			if [[ "${line}" == "<url>"  ]]; then
-				url=true
-				first=false
-			else
-				# get all lines before the first <url>, there is always necessary stuff above to be taken
-				# we do not need an array here but add the lines one by one
-				if [ "$first" = true ]; then
-					new_xml_content="${new_xml_content}""${line}"$'\n'
-					# unset the array to remove any leftovers for the first match of <url>
-					# necessary as we do not know if the first url block will be a hit or not
-					unset actual
-					#printf '%s\n' "${new_xml_content[@]}"
-					continue
-				fi
-			fi
-
-			# we are inside a block
-			if [ "$url" = true ]; then
-				# check if it is a <loc> block
-				if [[ "${line}" == "<loc>"* ]]; then
-					# check if it contains a usable url with "next"
-					if [[ "${line}" == *"${url_component_ok}"* ]]; then
-						hit=true
-						#echo "${line}"
-					fi
-				fi
-			fi
-
-			# at the end of the block
-			if [[ "${line}" == "</url>"  ]]; then
-				# use only if we have a hit
-				if [ "$hit" == true ]; then
-					#printf '%s\n' "${actual[@]}"
-					new_xml_content="${new_xml_content}"$(printf '%s\n' "${actual[@]}")$'\n'
-				fi
-				# if we had a hit, reset all
-				i=-1
-				url=false
-				hit=false
-				unset actual
-			fi
-
-		done < "${current_map}"
-
-		# finally, add all the remaining elements which are post the last </url>
-		# the last actual array contains all lines after the last </url>
-		new_xml_content="${new_xml_content}"$(printf '%s\n' "${actual[@]}")
-
-		# create a disallow entry for the particular product containig non- "next" entries
-		unset actual
-		add_robot_content= #$'\n'
-		for actual in "${unuseable_releases[@]}"
-		do
-			#echo $actual
-			url_component_ok=/"${product_name}"/"${actual}"/*
-#
-# NOTE the following script line is temporarily commented.
-#
-# Google can't reindex a site where robots.txt disallows urls which are, when accessed, redirected to a disallowed
-# page, even robots.txt is valid and does not disallow the site itself.
-#
-# robots.txt --> disallow: /server/10.8/*
-# docs.owncloud.com --> docs.owncloud.com/server/latest --> docs.owncloud.com/server/10.8 (** bang **)
-#
-# when server gets its own repository and the main site entry does not automatically redirect to server/latest,
-# this comment can be removed and the disallow is added again.
-#
-#			add_robot_content="${add_robot_content}""Disallow: ""${url_component_ok}"$'\n'
-		done
-
-		# echo the output in debug mode but do not do anything else
-		# make the headline green for better identification of the current xml file
-		if [[ "$DRY_RUN" == true ]]; then
-			echo
-			echo -e "\e[1;32m${current_map} \e[0m"
-			echo
-			echo "${new_xml_content}"
-			echo
-			echo -e "\e[1;32mTo be added to ${HTML_ROOT}/robots.txt\e[0m"
-			echo "${add_robot_content}"
-		else
-			# write the changes to the files
-			# backup the current sitemap file
-			mv "${current_map}" "${current_map}${BACKUP_EXT}"
-			# create a new sitemap file and write the new contents
-			cat <<< "$new_xml_content" > "$current_map"
-			# append the contents to the new robots file
-			cat <<< "$add_robot_content" >> "${current_robot}"
-		fi
-
-		# for testing to make only one block
-#		break
-	done
-
-	# remove any emtpy lines at the END of the script
-	# this keeps the content compact if not adding disallows
-	sed -i -e :a -e '/^\n*$/{$d;N;ba' -e '}' "${current_robot}"
-
-}
-
-# get the header lines of the current sitemap file
-function get_sitemap_header()
-{
-	# example
-	# <?xml version="1.0" encoding="UTF-8"?>
-	# <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-	local sh=
-	sh=$(sed '/<url>/Q' "$1")
-	echo "${sh}"
-}
-
-# get all unusabe releases of the current sitemap file
-function get_unuseable_releases()
-{
-	local a=()
-	local ur=()
-
-	# make an array of the default name to intersect with the existing names
-	local name=( "${DEFAULT_NAME}" )
-
-	ur=($(grep -oP '(?<=loc>)[^<]+' "$1" | grep -Po '\w\K/\w+[^?]+' | cut -c 2- | cut -d \/ -f 2 | sort | uniq))
-
-	# intersect the arrays
-	# the array must be returned in this case with printf and not with echo
-	readarray -t a < <(echo ${ur[@]} ${name[@]} | tr ' ' '\n' | sort | uniq -d | xargs echo ${ur[@]} | tr ' ' '\n' | sort | uniq -u)
-	printf '%s\n' "${a[@]}"
-}
-
-# get the product name of a particular sitemap file
-function get_product_name()
-{
-	local pn=
-	pn=($(grep -oP '(?<=loc>)[^<]+' "$1" | grep -Po '\w\K/\w+[^?]+' | cut -c 2- | cut -d \/ -f 1 | uniq))
-	echo "${pn}"
-}
-
-# query the URL of the docs site.
-# this can also be a local address (e.g. ip:port) if you have built e.g. with "yarn antora-local".
-# will be used to identify a deletable <url> block in a sitemap file
-function get_doc_www_root()
-{
-	DOC_WEB_ROOT=($(grep -oP '(?<=loc>)[^<]+' "$1" | cut -d/ -f1-3 |uniq))
-	#echo "${DOC_WEB_ROOT}"
-}
-
-# get_sitemaps <main_sitemap_file> <list_of_allowed_products_to_process>
-# get all sitemap files which are available and match the product array list
-# using parameters helps customizing the function easily
-function get_sitemaps()
-{
-	local a=()						# helper variable
-	local sitemap_base_file="$1"	# Save first argument in a variable 
-	shift							# Shift all arguments to the left (original $1 gets "lost")
-	local b=("$@")					# Rebuild the products array with rest of arguments
-	local found_sitemaps=()			# the list of sitemap files found 
-	
-	# append .xml to each availabe product and sort the outcome
-	a=( "${b[@]/%/.xml}" )
-	readarray -t available_products < <(printf '%s\n' "${a[@]}" | sort)
-	#printf '%s\n' "${available_products[@]}"
-
-	# get the list of referenced sitemaps from the main sitemap file and store it into an array
-	# the name is stored without the extension ".xml"
-	# the files are defined inbetween the <loc> </loc> tags
-	# for matching the arrays, we need to remove the leading "sitemap-" string
-	readarray -t found_sitemaps < <(grep -oP '(?<=loc>)[^<]+' "${sitemap_base_file}" | grep -Po '\w\K/\w+[^?]+' | cut -c 2- | sed 's/sitemap-//g' | sort)
-	#printf '%s\n' "${found_sitemaps[@]}"
-
-	# Intersection of possible and available sitmap files
-	readarray -t a < <(echo ${available_products[@]} ${found_sitemaps[@]} | tr ' ' '\n' | sort | uniq -d)
-	a=( "${a[@]/#/sitemap-}" )
-	#printf '%s\n'  "${a[@]}"
-	SITEMAPS_TO_USE=( "${a[@]/#/${HTML_ROOT}/}" )
-	#printf '%s\n' "${SITEMAPS_TO_USE[@]}"
-}
-
-# rename all files with the extension ${BACKUP_EXT} back to its original extension.
-# this is only when playing around and you do not want to manually type all the commands 
-function revert()
-{
-	local f=
-
-	echo
-	# find all files in the html root dir, and only there, where the extension is backup_ext
-	for f in `find "${HTML_ROOT}" -maxdepth 1 -name "*${BACKUP_EXT}" -type f`; do
-		# move the file by removing the backup extension
-		mv -- "${f}" "${f%${BACKUP_EXT}}"
-		echo -e "\e[1;32mReverting file: \e[0m" "${f}"
-		#echo "${f%${BACKUP_EXT}}"
-		#echo "$f"
-	done
-	# if f is empty there was nothing found to be reverted
-	if [ -z "$f" ]; then
-		echo -e "\e[1;32mNothing found to be reverted \e[0m" "${f}"
-	fi
-	echo
-}
-
-function revert_and_run()
-{
-	revert
-	run
-}
-
-function usage()
-{
-	echo
-	echo "Usage: bin/optimize_crawl [-h] [-e] [-x] [-d] [-r]"
-	echo
-	echo "-h ... help"
-	echo "-e ... Execute (add debug mode for a dry run)"
-	echo "-x ... Revert first and Execute (add debug mode for a dry run)"
-	echo "-d ... Debug mode, print only of the optimized sitmap content, no saving."
-	echo "-r ... Revert backuped files to originals if exists."
-	echo
-}
-
-while getopts ":rdexh:" o
-do
-	case ${o} in
-		r)
-			ACTION="REVERT"
-			;;
-		d )
-			DRY_RUN=true
-			;;
-		e)
-			ACTION="RUN"
-			;;
-		x)
-			ACTION="REVERT_AND_RUN"
-			;;
-		h | * )
-			ACTION="HELP"
-			;;
-	esac
-done
-
-shift $((OPTIND-1))
-
-case "$ACTION" in
-	REVERT)
-		revert
-		;;
-	RUN)
-		run
-		;;
-	REVERT_AND_RUN)
-		revert_and_run
-		;;
-	HELP | *)
-		usage
-		exit $ERR_UNSUPPORTED_ACTION
-		;;
-esac
diff --git a/ext-antora/sitemap-cleanup.js b/ext-antora/sitemap-cleanup.js
new file mode 100644
index 0000000000..f9929281e5
--- /dev/null
+++ b/ext-antora/sitemap-cleanup.js
@@ -0,0 +1,320 @@
+"use strict"
+
+/**
+ * Cleanup sitemap files - remove any links that should not be kept - SEO and search relevant 
+ * Version 1.0.0
+ * 
+ * @param {Object} playbook configuration object      - The configuration object for Antora
+ * @param {Object} config   configuration object      - Configuration provided by the playbook
+ * @param {Object}          playbook.site             - Site-related configuration data.
+ * @param {String}          playbook.site.url         - The base URL of the site.
+ * @param {Object}          playbook.output           - Output-related configuration data.
+ * @param {String}          playbook.output.dir       - The output directory of the build.
+ * @param {Array<string>}   [config.validsegments]    - Segments to keep in sitemap files ['next', 'latest', ...]
+ * @param {String}          [config.preferredsegment] - Optional, single segment to keep; if set, it must be one of validsegments
+ */
+
+const fs    = require('fs')
+const smf = 'sitemap.xml'
+const sitemap_keyword = 'sitemap-'
+
+module.exports.register = function ({ config }) {
+    this.on("sitePublished", async ({ playbook, publications }) => {
+      // everything below works on the sitemap files of the already published site
+      // get relevant Antora data
+      // NOTE(review): assumes the first publication holds the local output directory - confirm
+      const logger = this.getLogger('sitemap-cleanup')
+      const siteUrl = playbook.site.url
+      const outputDir = publications[0].resolvedPath
+
+      // read playbook config data
+
+      let validSegments = []
+      if(config.validsegments) {
+        config.validsegments.forEach(element => validSegments.push(element))
+        //validSegments.forEach(element => logger.warn(element))
+      } else {
+        console.log()
+        logger.warn('Config: No valid segment(s) found to keep for sitemaps. Exiting \n')
+        this.stop()
+      }
+
+      // note that preferredsegment is a string and not an array in the playbook
+      // if omitted, validSegments are used
+      let preferredSegments = validSegments
+      if(config.preferredsegment) {
+        if (validSegments.includes(config.preferredsegment)) {
+          preferredSegments = []
+          preferredSegments.push(config.preferredsegment)
+        } else {
+          console.log()
+          logger.warn('Config: preferredsegment, if set, must match one of the validsegments. Exiting \n')
+          this.stop()
+        }
+      }
+
+      // sitemaps processed will be printed to the console
+      let printSitemapFound = null
+      if(config.printsitemapfound) {
+        printSitemapFound = true
+      } else {
+        printSitemapFound = false
+      }
+
+      // changed content of sitemaps processed will be printed to the console
+      let printContent = null
+      if(config.printcontent) {
+        printContent = true
+      } else {
+        printContent = false
+      }
+
+      // check if there is a sitemap file present and load its content
+      // return to Antora if no sitemap file is found
+
+      var content = null
+      const main_sitemap_file = outputDir + '/' + smf
+      try {
+        content = await get_file_from_local(main_sitemap_file)
+      } catch (e) {
+        console.log('\n')
+        logger.warn('There is an error accessing the ' + smf + ' file. Continuing with Antora')
+        if (e.code === 'ENOENT') {
+          logger.warn(smf + ' not found at:')
+          logger.warn(main_sitemap_file)
+        } else {
+          logger.warn(e)
+        }
+        console.log('\n')
+        return
+      }
+
+      // based on the content of the main sitemap file, check which ones to parse
+      // return to Antora if no parsable tags are found: (</url> | </sitemap>)
+
+      var parsable_sitemaps = []
+      if (content.includes('</url>')) {
+        // the sitemap file itself uses url tags which only occurs when no sub-sitemaps are used
+        parsable_sitemaps.push(outputDir + '/' + smf)
+      } else if (content.includes('</sitemap>')) {
+        // check for sub-sitemaps
+        parsable_sitemaps = get_parsable_sitemap_files (content, siteUrl + '/', outputDir + '/')
+        if (!parsable_sitemaps.length) {
+          console.log('\n')
+          logger.warn('No parseable sub-sitemaps found in '+ smf + ' Continuing with Antora')
+          console.log('\n')
+          return
+        }
+      } else {
+        // nothing found at all, something must went wrong with the sitemap created by Antora
+        console.log('\n')
+        logger.warn('No parseable content found in '+ smf + ' Continuing with Antora')
+        console.log('\n')
+        return
+      }
+
+      // parse each sitemap, replace content on matches and save the result:
+      // remove any entries that are NOT defined via config variables
+      // parse_sitemap_file is async: await all calls so this listener only returns once every file is done
+      console.log()
+      await Promise.all(parsable_sitemaps.map(element => parse_sitemap_file(parsable_sitemaps.length, outputDir,
+         content, siteUrl, validSegments, preferredSegments, printSitemapFound, printContent, logger, element)))
+
+    })
+
+}
+
+/**
+ * parse the sitemap file that is handed over and save any changes back
+ *
+ * @param  {integer} total_sitemaps       the number of sitemap files found
+ * @param  {string}  output_dir           the directory where the sitemap files are located
+ * @param  {string}  content              the content of sitemap.xml
+ * @param  {string}  site_url             the siteURL to build for
+ * @param  {array}   valid_segments       array of segments that are technically valid
+ * @param  {array}   preferred_segments   array of segments whose entries are kept, a subset of valid_segments
+ * @param  {bool}    print_sitemap_found  print the sitemap file component name
+ * @param  {bool}    print_content        print the changed sitemap file content
+ * @param  {object}  logger               the logger object
+ * @param  {string}  file                 the sitemap file to process
+ * @return {Promise}                      resolves without data, read and write errors are only logged
+ */
+async function parse_sitemap_file (total_sitemaps, output_dir, content, site_url, valid_segments,
+  preferred_segments, print_sitemap_found, print_content, logger, file) {
+
+    // get the name of the component from the filename or the contents of the file
+    // the component name is important to properly strip off all data from the left that is not required
+    // for multi version components, this will always be a component name
+    // for single version components: either empty || a component name
+    var component = ''
+    if (total_sitemaps > 1) {
+      // because there are more sitemap files, the component name can be derived from the filename
+      component = file.replace(output_dir + '/', '').replace(sitemap_keyword, '').replace('.xml', '')
+    } else {
+      // because there is only one sitemap file, the component name must be gathered from the contents
+      const first_match = content.match(/<loc>([^<]*)<\/loc>/)
+      const temp_string = first_match ? first_match[1].replace(site_url + '/','') : ''
+      const index = temp_string.indexOf('/')
+      component = temp_string.substring(0, index)
+    }
+
+    // if the component name is empty, there is nothing to do, because it is a versionless ROOT component
+    if (component.length === 0) return
+
+    // get the contents of the sitemap file
+    try {
+      var sitemap_content = await get_file_from_local(file)
+    } catch (e) {
+      console.log('\n')
+      logger.warn('Cant access file. Continuing with next sitemap file.')
+      if (e.code === 'ENOENT') {
+        logger.warn(file + ' not found')
+      } else {
+        logger.warn(e)
+      }
+      console.log('\n')
+      return
+    }
+
+    // print the component name of the sitemap file found
+    if (print_sitemap_found) {
+     logger.warn('Processing component: ' + component)
+    }
+
+    // get all multiline blocks with <url> ... </url>
+    const all_url_matches = sitemap_content.match(/(?:<url>)[\s\S]*?(?:<\/url>)/gm)
+
+    // match() with the g flag returns null, not an empty array, if there is no <url> ... </url> structure
+    if (!all_url_matches) {
+      if (print_sitemap_found) {
+        logger.warn('No processable data found, continuing')
+      }
+      return
+    }
+
+    var i = null
+    var hit = null
+    var version = null
+    var has_changed = false
+
+    for (i=0; i < all_url_matches.length; i++) {
+      hit = true
+      // extract the version from the url, a block without a <loc> yields an empty version and gets removed
+      // <loc>http://localhost:8080/component/version/file.html</loc>
+      const loc_match = all_url_matches[i].match(/<loc>([^<]*)<\/loc>/)
+      version = (loc_match ? loc_match[1] : '').replace(site_url + '/', '').replace(component + '/', '')
+      version = version.substring(0, version.indexOf("/"))
+      // keep the block only if its version is a valid and preferred segment, all other blocks are removed
+      // NOTE(review): has_changed is set when a block is kept, a file without preferred entries stays untouched - confirm
+      if (valid_segments.includes(version)) {
+        if (preferred_segments.includes(version)) {
+          hit = false
+          has_changed = true
+        }
+      }
+
+      // the block identified needs to be removed
+      if (hit) {
+        sitemap_content = sitemap_content.replace(all_url_matches[i], '')
+      }
+    }
+
+    // if nothing has changed, return
+    if (!has_changed) {
+      if (print_sitemap_found) {
+        logger.warn('The content of the sitemap was not changed')
+      }
+      return
+    }
+
+    // remove all blank lines caused by the removal process
+    sitemap_content = removeEmptyLinesRegex(sitemap_content)
+
+    // print the content of the sitemap file
+    if (print_content) {
+      logger.warn('New content:')
+      console.log(sitemap_content)
+    }
+
+    // write the changed content back to file
+    // on error, we cant predict most common ones, so we print the complete message and skip the success note
+    try {
+      fs.writeFileSync(file, sitemap_content, {encoding: 'utf8', mode: 0o664})
+    } catch (e) {
+      console.log(e)
+      return
+    }
+
+    if (print_sitemap_found) {
+      logger.warn('New sitemap data successfully written back')
+    }
+
+    return
+}
+
+/**
+ * strip every blank or whitespace-only line from the given text
+ *
+ * @param  {string} content         text that possibly contains blank lines
+ * @return {string}                 the text without blank lines
+ */
+function removeEmptyLinesRegex(content) {
+  // first pass, /^\s*[\r\n]/gm:
+  //   ^       with the 'm' flag this anchors at the start of every line, not just of the string
+  //   \s*     zero or more whitespace characters (space, tab, line breaks, etc.)
+  //   [\r\n]  the line break that terminates the blank run
+  //   'g'     ensures all matches are replaced, not just the first
+  const no_blank_lines = content.replace(/^\s*[\r\n]/gm, '')
+  // second pass: collapse any remaining run of two or more line breaks into a single one
+  const squeezed = no_blank_lines.replace(/(\r\n|\r|\n){2,}/g, '$1')
+  return squeezed
+}
+
+/**
+ * return an array with parsable sitemap files
+ *
+ * every <loc> entry of the main sitemap is stripped of its tags and of site_url,
+ * what remains is appended to output_dir
+ *
+ * @param  {string} content         the content of sitemap.xml
+ * @param  {string} site_url        the siteURL to build for, removed from each location
+ * @param  {string} output_dir      the directory where the sitemap files are located, prepended as is
+ * @return {array}                  a list of full-path sitemap files to parse, empty if none is referenced
+ */
+function get_parsable_sitemap_files (content, site_url, output_dir) {
+    // regex based extraction of the locations, see:
+    // https://www.linkedin.com/pulse/parsing-xml-javascript-python-sergiu-panaite-1
+    const loc_tags = content.match(/<loc>([^<]*)<\/loc>/g)
+
+    // match() yields null instead of an array when there is no <loc> entry at all
+    if (!Array.isArray(loc_tags)) { return [] }
+
+    // one pass per entry: strip off the xml tags and the site_url,
+    // then add the path in front of what is left
+    return loc_tags.map(tag => {
+      // replace() with a string pattern only touches the first occurrence
+      const relative_name = tag
+        .replace('<loc>','')
+        .replace('</loc>','')
+        .replace(site_url,'')
+      return `${output_dir}${relative_name}`
+    })
+}
+
+/**
+ * read the contents of a given file
+ *
+ * @param  {string} file        the file to read, decoded as utf8
+ * @return {Promise<string>}    resolves with the data from the file read
+ *                              rejects with the filesystem error if the file cannot be read,
+ *                              e.g. error.code === 'ENOENT' for a missing file;
+ *                              needs try/catch (or .catch) from the caller
+ */
+function get_file_from_local(file) {
+    // promise a file from local filesystem
+    // fs.promises.readFile is the promise flavour of fs.readFile and settles exactly once:
+    // resolved with the contents when the file can be read, rejected with the error otherwise,
+    // so there is no callback in which reject() could fall through to resolve()
+
+    return fs.promises.readFile(file, 'utf8')
+}
diff --git a/site.yml b/site.yml
index 6a9437e227..ca5966e608 100644
--- a/site.yml
+++ b/site.yml
@@ -85,6 +85,12 @@ antora:
       #attributefile: https://raw.githubusercontent.com/owncloud/docs/refs/heads/master/global-attributes.yml
       attributefile: ./global-attributes.yml
       enabled: true
+    - require: ./ext-antora/sitemap-cleanup.js
+      validsegments: ['next', 'latest']
+      preferredsegment: latest
+      printsitemapfound: true
+      printcontent: false
+      enabled: true
     # for testing only, prints out attributes used
     # use only one or the other, output can be big
     #- ./ext-antora/attributes-used-in-site-yml.js