/**
 * Strips images, line breaks and empty wrapper elements from an HTML string.
 *
 * The steps run in order, so a `<p>` whose only content was an `<img>` or an
 * empty `<span>` is itself removed once that content is gone. Tag names are
 * matched case-insensitively and as whole names only, so look-alike tags such
 * as `<img-gallery>` or `<pre>` are left untouched.
 *
 * @param htmlString Raw html string
 * @returns The HTML string with every `<img>` tag removed, every `<span>` and
 * `<p>` element that holds nothing but whitespace removed, and every run of
 * `<br>` tags (plus the whitespace following them) removed.
 */
const cleanseHtml = (htmlString: string): string => {
  // `(?:\s[^>]*)?` permits attributes while pinning the tag name itself.
  const imgPattern = /<img(?:\s[^>]*)?\/?>/gi
  const emptySpanPattern = /<span(?:\s[^>]*)?>\s*<\/span>/gi
  const emptyParagraphPattern = /<p(?:\s[^>]*)?>\s*<\/p>/gi
  // Consecutive <br>/<br/> tags are dropped entirely, not replaced by a single break.
  const brPattern = /(<br\s*\/?>\s*)+/gi

  return htmlString
    .replace(imgPattern, '')
    .replace(emptySpanPattern, '')
    .replace(emptyParagraphPattern, '')
    .replace(brPattern, '')
}

/**
 * Pulls every `document: "<title>", html content: "<html>…</html>"` section out
 * of the input.
 *
 * @param input Text that may contain one or more titled HTML sections
 * @returns One string per section, shaped as `"<title>", html content: <html>…</html>"`
 * (the quote that closes the html content is kept, the one that opens it is not).
 * Empty when the input holds no such section.
 */
const splitDocuments = (input: string) => {
  // `s` flag: titles and html bodies may span several lines.
  const sectionPattern = /document: "(.*?)", html content: "(<html>.*?<\/html>")/gs

  return Array.from(input.matchAll(sectionPattern), ([, title, html]) => `"${title}", html content: ${html}`)
}

/**
 * Extracts the HTML documents contained in a raw content string.
 *
 * The input is first cleansed with `cleanseHtml` and then split with
 * `splitDocuments`:
 * - When titled sections are found, each one yields `{ document, text }`.
 *   `document` is the section title with its surrounding quotes stripped, line
 *   breaks turned into spaces and one trailing period dropped; `text` is the
 *   section's first `<html>…</html>` span.
 * - Otherwise the first `<html>…</html>` span of the cleansed input, if there
 *   is one, is returned as a single `{ text }` entry.
 *
 * @param docContentString Raw string holding one or more HTML documents
 * @returns The parsed documents, or an empty array when nothing could be extracted.
 */
export const documentsMatchInString = (docContentString: string) => {
  const cleansedHtml = cleanseHtml(docContentString)
  const documentSeparatedSegments = splitDocuments(cleansedHtml)

  const htmlPattern = /<html>[\s\S]*?<\/html>/

  // document sectioned html
  if (documentSeparatedSegments.length) {
    // Anchored and newline-tolerant: a title spanning several lines must be
    // captured whole, not just its last line.
    const documentTitlePattern = /^([\s\S]*?)(?=,\shtml content:)/
    const parsedDocuments: Array<{ document: string; text: string }> = []

    for (const currentDoc of documentSeparatedSegments) {
      const segment = currentDoc.trim()
      const docMatch = documentTitlePattern.exec(segment)
      const textMatch = segment.match(htmlPattern)

      // Skip segments that lack either a title or an html body.
      if (!docMatch || !textMatch) {
        continue
      }

      const documentValue = (docMatch[1] ?? '')
        .trim()
        .replace(/^"/, '')
        .replace(/"$/, '')
        .replace(/\r?\n/g, ' ')
        .replace(/\.$/, '')

      parsedDocuments.push({ document: documentValue, text: textMatch[0] || '' })
    }

    return parsedDocuments
    // non documented html
  } else {
    const textMatch = cleansedHtml.trim().match(htmlPattern)
    if (textMatch) {
      return [{ text: textMatch[0] || '' }]
    }

    return []
  }
}

/**
 * Finds which of the available documents are cited in a piece of content.
 *
 * A document counts as cited when its exact name appears wrapped in square
 * brackets (e.g. `[Report.pdf]`); the comparison is case-insensitive.
 *
 * @param availableDocuments Document names to look for
 * @param contentToParse Text that may reference documents
 * @returns The cited names in first-occurrence order, with exact duplicates removed.
 */
export const getUniqueMatchesInContent = (availableDocuments: string[], contentToParse: string): string[] => {
  // De-duplicate up front so each distinct name is compiled and tested only once.
  const distinctDocuments = Array.from(new Set(availableDocuments))

  return distinctDocuments.filter((documentName) => {
    // Escape every regex metacharacter (including `|`) so the name is matched literally.
    const escapedName = documentName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

    // match inputs with square brackets only, case-insensitively
    const bracketedName = new RegExp(`\\[${escapedName}\\]`, 'iu')

    return bracketedName.test(contentToParse)
  })
}
