import React, { useEffect, useState } from 'react';
import nlp from 'compromise';

// Lower-case words that are never shown in the word cloud: common English
// function words plus a handful of site-specific terms (e.g. 'physio', 'ramp').
// Each word is listed exactly once; lookups are done against lower-cased terms.
const STOP_WORDS = new Set([
  'and', 'the', 'to', 'of', 'in', 'a', 'for', 'with', 'that', 'you', 'we', 'i',
  'is', 'are', 'be', 'will', 'can', 'your', 'our', 'it', 'this', 'these', 'those',
  'they', 'them', 'their', 'at', 'by', 'from', 'on', 'or', 'but', 'not', 'an', 'as',
  'what', 'when', 'where', 'who', 'which', 'why', 'how', 'all', 'any', 'both', 'each',
  'few', 'more', 'most', 'other', 'some', 'such', 'than', 'too', 'very', 'just',
  'should', 'now', 'also', 'may', 'must', 'has', 'have', 'had', 'was', 'were',
  'been', 'being', 'into', 'if', 'then', 'else', 'out', 'about', 'over', 'again',
  'once', 'under', 'further', 'before', 'after', 'above', 'below', 'up', 'down',
  'during', 'while', 'through', 'ramp', 'whether', 'include', 'article', 'providing',
  'unique', 'including', 'understanding', 'need', 'needs', 'his', 'her',
  'its', 'here', 'there', 'make', 'physio', 'physiotherapy', 'therapy',
]);

// Lower-case terms that must keep their exact spelling: any term found here
// is returned as-is instead of being converted to its singular form.
const PROTECTED_WORDS = new Set(['ndis']);

// CSS selector for the elements whose text feeds the word cloud.
const CONTENT_SELECTOR = 'h1, h2, h3, h4, h5, h6, p, ul li';

// Elements located inside any of these containers are ignored
// (this includes the word cloud itself).
const EXCLUDED_SELECTOR = [
  '.word-cloud-container',
  '.header',
  '.health-section',
  '.testimonial-section',
  '.footer-bar',
].join(', ');

// compromise match patterns, applied in order and concatenated:
// noun phrases, verbs, then three lists of domain vocabulary.
const TERM_PATTERNS = [
  '#Noun+',
  '#Verb',
  '(physio|physiotherapy|therapy|rehabilitation|exercise|training|fitness|workout|gym|health|wellness|condition|injury|pain|muscle|joint|spine|treatment|recovery)*',
  '(back pain|neck pain|shoulder pain|knee pain|ankle pain|sports injury|postural correction|headache|migraine|arthritis|tendinopathy|sprain|strain|sciatica|disc bulge|tennis elbow|plantar fasciitis|rotator cuff|acl|mcl|rehabilitation|post surgery|chronic pain)*',
  '(massage|dry needling|manual therapy|stretching|strengthening|mobility|flexibility|balance|coordination|posture|ergonomic|clinical pilates|sports specific|return to sport|prehab|rehab)*',
];

// Substrings that mark a term as markup/URL residue rather than a real word.
const NOISE_FRAGMENTS = ['type=', 'javascript', 'www', 'http', 'html'];

const DIGITS_ONLY = /^\d+$/;
const PUNCTUATION = /[{}()\[\]<>"'.,!?;:]/;

// Maximum number of entries passed to `onWordsProcessed`.
const MAX_WORDS = 100;

/** Joins the trimmed, non-empty text of every non-excluded content element with spaces. */
const collectPageText = () =>
  Array.from(document.querySelectorAll(CONTENT_SELECTOR))
    .filter((element) => !element.closest(EXCLUDED_SELECTOR))
    .map((element) => element.textContent.trim())
    .filter((content) => content.length > 0)
    .join(' ');

/** Runs compromise over `text` and returns the matched terms as an array of strings. */
const extractTerms = (text) => {
  const doc = nlp(text);
  const [firstPattern, ...otherPatterns] = TERM_PATTERNS;
  return otherPatterns
    .reduce((matches, pattern) => matches.concat(doc.match(pattern)), doc.match(firstPattern))
    .not('#Adjective')   // Remove adjectives
    .not('#Pronoun')     // Remove pronouns
    .not('#Punctuation') // Remove punctuation
    .terms()             // Get individual terms
    .out('array');
};

/** Lower-cases a term and, unless it is protected, converts it to its singular form. */
const normalizeTerm = (rawTerm) => {
  const term = rawTerm.toLowerCase();
  if (PROTECTED_WORDS.has(term)) {
    return term;
  }
  // `text()` is empty when compromise finds no noun in the term; keep the term then.
  return nlp(term).nouns().toSingular().text() || term;
};

/**
 * Decides whether a normalized term may appear in the word cloud.
 * Only the bare token "com" is rejected as a URL remnant: a substring test
 * would also discard ordinary words such as "outcome" or "community", and
 * anything still containing a dot is already rejected by the punctuation check.
 */
const isCloudWord = (term) =>
  term.length > 2 &&
  term !== 'com' &&
  !STOP_WORDS.has(term) &&
  !DIGITS_ONLY.test(term) &&
  !PUNCTUATION.test(term) &&
  !NOISE_FRAGMENTS.some((fragment) => term.includes(fragment));

/** True when one string contains the other as a space-delimited part of a phrase. */
const sharesPhrase = (first, second) =>
  first.includes(`${second} `) ||
  first.includes(` ${second}`) ||
  second.includes(`${first} `) ||
  second.includes(` ${first}`);

/**
 * Counts occurrences of each word. A word seen for the first time is folded
 * into an existing entry when the two share a phrase (see `sharesPhrase`).
 * A Map is used so that words such as "constructor" cannot collide with
 * properties inherited from Object.prototype.
 */
const countWords = (words) => {
  const counts = new Map();
  for (const word of words) {
    if (counts.has(word)) {
      counts.set(word, counts.get(word) + 1);
      continue;
    }
    const related = [...counts.keys()].find((existing) => sharesPhrase(existing, word));
    if (related === undefined) {
      counts.set(word, 1);
    } else {
      counts.set(related, counts.get(related) + 1);
    }
  }
  return counts;
};

/**
 * Effect-only component: scans the page text, derives word frequencies and
 * reports them through a callback. Renders nothing.
 *
 * @param {Object} props
 * @param {Function} props.onWordsProcessed - Called with an array of
 *   `{ text, value }` objects (word and its count), sorted by descending
 *   count and limited to the 100 most frequent entries.
 *
 * The effect depends on `onWordsProcessed`, so it re-runs whenever that
 * prop's identity changes; pass a stable (memoized) callback to avoid
 * re-processing the page on every parent render.
 * Any error raised while processing is logged to the console, in which case
 * the callback is not invoked.
 */
const TextProcessor = ({ onWordsProcessed }) => {
  useEffect(() => {
    try {
      const pageText = collectPageText();
      const words = extractTerms(pageText)
        .map((term) => normalizeTerm(term))
        .filter((term) => isCloudWord(term));

      const wordArray = Array.from(countWords(words), ([text, value]) => ({ text, value }))
        .sort((a, b) => b.value - a.value)
        .slice(0, MAX_WORDS);

      onWordsProcessed(wordArray);
    } catch (err) {
      console.error('Error loading or processing text:', err);
    }
  }, [onWordsProcessed]);

  return null;
};

export default TextProcessor; 