', buffer: '...', has_direct_content: bool } @div_stack = [] @skip_div = 0 # Counter for divs beyond MAX_DIV_DEPTH that we're skipping end # This is the main entry point for parsing a string to remove deeply nested divs. # # @param string [String] The HTML string to parse. # @param chunk [Symbol] :fragment or :document to indicate parsing mode. def self.parse(string, chunk: :fragment) handler = new(chunk:) parser = Nokogiri::HTML::SAX::Parser.new(handler) parser.parse(string) handler.out end def start_element(name, attrs = []) # Skip html/body wrapper tags added by SAX parser for fragments return if chunk != :document && %w[html body].include?(name) if name == 'div' # Hard limit: skip divs beyond MAX_DIV_DEPTH entirely if @div_stack.size >= MAX_DIV_DEPTH @skip_div += 1 return end # Push a new frame - don't write the div yet, we'll decide on close @div_stack.push({ tag: build_tag(name, attrs), buffer: +'', has_direct_content: attrs.any? # divs with attributes are considered "meaningful" }) else # Non-div element: mark current div as having direct content and write to buffer mark_has_direct_content current_buffer << build_tag(name, attrs) end end def end_element(name) # Skip html/body wrapper tags added by SAX parser for fragments return if chunk != :document && %w[html body].include?(name) if name == 'div' # Handle closing tags for skipped divs if @skip_div.positive? @skip_div -= 1 return end return if @div_stack.empty? frame = @div_stack.pop if frame[:has_direct_content] # This div has direct content - include it with wrapper write_to_parent("#{frame[:tag]}#{frame[:buffer]}