Compare two HTML texts and mark differences-PHP Chinese Network Q&A

I want to mark the difference between two html texts, I found a solution, everything works as expected (deleted files receive class name "del", inserted text receives class name "ins" and related styles) only One thing is wrong - for some reason the text is marked in the wrong place.

var Match, calculate_operations, consecutive_where, create_index, diff, find_match, find_matching_blocks, html_to_tokens, is_end_of_tag, is_start_of_tag, is_tag, is_whitespace, isnt_tag, op_map, define, recursively_find_matching_blocks, render_operations, wrap; is_end_of_tag = function (char) { return char === '>'; }; is_start_of_tag = function (char) { return char === '<'; }; is_whitespace = function (char) { return /^\s+$/.test(char); }; is_tag = function (token) { return /^\s*<[^>]+>\s*$/.test(token); }; isnt_tag = function (token) { return !is_tag(token); }; Match = class Match { constructor(start_in_before1, start_in_after1, length1) { console.log('lalala'); this.start_in_before = start_in_before1; this.start_in_after = start_in_after1; this.length = length1; this.end_in_before = this.start_in_before + this.length - 1; this.end_in_after = this.start_in_after + this.length - 1; } }; html_to_tokens = function (html) { var char, current_word, i, len, mode, words, letters; mode = 'char'; current_word = ''; // letters = []; words = []; for (i = 0, len = html.length; i < len; i++) { char = html[i]; // letters.push(char); switch (mode) { case 'tag': if (is_end_of_tag(char)) { current_word += '>'; words.push(current_word); current_word = ''; if (is_whitespace(char)) { mode = 'whitespace'; } else { mode = 'char'; } } else { current_word += char; } break; case 'char': if (is_start_of_tag(char)) { if (current_word) { words.push(current_word); } current_word = '<'; mode = 'tag'; } else if (/\s/.test(char)) { if (current_word) { words.push(current_word); } current_word = char; mode = 'whitespace'; } else if (/[\w\#@]+/i.test(char)) { current_word += char; } else { if (current_word) { words.push(current_word); } current_word = char; } break; case 'whitespace': if (is_start_of_tag(char)) { if (current_word) { words.push(current_word); } current_word = '<'; mode = 'tag'; } else if (is_whitespace(char)) { current_word += char; } else { if (current_word) { words.push(current_word); } current_word = char; mode = 'char'; } break; default: throw new Error(`Unknown mode ${mode}`); } } if (current_word) { words.push(current_word); } return words; }; find_match = function ( before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, start_in_after, end_in_after ) { var best_match_in_after, best_match_in_before, best_match_length, i, index_in_after, index_in_before, j, len, locations_in_after, looking_for, match, match_length_at, new_match_length, new_match_length_at, ref, ref1; best_match_in_before = start_in_before; best_match_in_after = start_in_after; best_match_length = 0; match_length_at = {}; for ( index_in_before = i = ref = start_in_before, ref1 = end_in_before; ref <= ref1 ? i < ref1 : i > ref1; index_in_before = ref <= ref1 ? ++i : --i ) { new_match_length_at = {}; looking_for = before_tokens[index_in_before]; locations_in_after = index_of_before_locations_in_after_tokens[looking_for]; for (j = 0, len = locations_in_after.length; j < len; j++) { index_in_after = locations_in_after[j]; if (index_in_after < start_in_after) { continue; } if (index_in_after >= end_in_after) { break; } if (match_length_at[index_in_after - 1] == null) { match_length_at[index_in_after - 1] = 0; } new_match_length = match_length_at[index_in_after - 1] + 1; new_match_length_at[index_in_after] = new_match_length; if (new_match_length > best_match_length) { best_match_in_before = index_in_before - new_match_length + 1; best_match_in_after = index_in_after - new_match_length + 1; best_match_length = new_match_length; } } match_length_at = new_match_length_at; } if (best_match_length !== 0) { match = new Match( best_match_in_before, best_match_in_after, best_match_length ); } return match; }; recursively_find_matching_blocks = function ( before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, start_in_after, end_in_after, matching_blocks ) { var match; match = find_match( before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, start_in_after, end_in_after ); if (match != null) { if ( start_in_before < match.start_in_before && start_in_after < match.start_in_after ) { recursively_find_matching_blocks( before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, match.start_in_before, start_in_after, match.start_in_after, matching_blocks ); } matching_blocks.push(match); if ( match.end_in_before <= end_in_before && match.end_in_after <= end_in_after ) { recursively_find_matching_blocks( before_tokens, after_tokens, index_of_before_locations_in_after_tokens, match.end_in_before + 1, end_in_before, match.end_in_after + 1, end_in_after, matching_blocks ); } } return matching_blocks; }; create_index = function (p) { var i, idx, index, len, ref, token; if (p.find_these == null) { throw new Error('params must have find_these key'); } if (p.in_these == null) { throw new Error('params must have in_these key'); } index = {}; const arr = html_to_tokens(p.find_these); const arr_number = []; for (var i = 0; i < arr.length; i++) { arr_number.push(arr[i].length); } ref = p.find_these; for (i = 0, len = ref.length; i < len; i++) { token = ref[i]; index[token] = []; idx = p.in_these.indexOf(token); while (idx !== -1) { index[token].push(idx); idx = p.in_these.indexOf(token, idx + 1); } } console.log(token, 'token'); console.log(arr_number, 'arr_number'); return index; }; find_matching_blocks = function (before_tokens, after_tokens) { var index_of_before_locations_in_after_tokens, matching_blocks; matching_blocks = []; index_of_before_locations_in_after_tokens = create_index({ find_these: before_tokens, in_these: after_tokens, }); console.log( index_of_before_locations_in_after_tokens, 'index_of_before_locations_in_after_tokens ' ); return recursively_find_matching_blocks( before_tokens, after_tokens, index_of_before_locations_in_after_tokens, 0, before_tokens.length, 0, after_tokens.length, matching_blocks ); }; calculate_operations = function (before_tokens, after_tokens) { var action_map, action_up_to_match_positions, i, index, is_single_whitespace, j, last_op, len, len1, match, match_starts_at_current_position_in_after, match_starts_at_current_position_in_before, matches, op, operations, position_in_after, position_in_before, post_processed; if (before_tokens == null) { throw new Error('before_tokens?'); } if (after_tokens == null) { throw new Error('after_tokens?'); } position_in_before = position_in_after = 0; operations = []; action_map = { 'false,false': 'replace', 'true,false': 'insert', 'false,true': 'delete', 'true,true': 'none', }; matches = find_matching_blocks(before_tokens, after_tokens); console.log(matches, 'matches'); matches.push(new Match(before_tokens.length, after_tokens.length, 0)); for (index = i = 0, len = matches.length; i < len; index = ++i) { match = matches[index]; match_starts_at_current_position_in_before = position_in_before === match.start_in_before; match_starts_at_current_position_in_after = position_in_after === match.start_in_after; action_up_to_match_positions = action_map[ [ match_starts_at_current_position_in_before, match_starts_at_current_position_in_after, ].toString() ]; if (action_up_to_match_positions !== 'none') { operations.push({ action: action_up_to_match_positions, start_in_before: position_in_before, end_in_before: action_up_to_match_positions !== 'insert' ? match.start_in_before - 1 : void 0, start_in_after: position_in_after, end_in_after: action_up_to_match_positions !== 'delete' ? match.start_in_after - 1 : void 0, }); } if (match.length !== 0) { operations.push({ action: 'equal', start_in_before: match.start_in_before, end_in_before: match.end_in_before, start_in_after: match.start_in_after, end_in_after: match.end_in_after, }); } position_in_before = match.end_in_before + 1; position_in_after = match.end_in_after + 1; } post_processed = []; last_op = { action: 'none', }; is_single_whitespace = function (op) { if (op.action !== 'equal') { return false; } if (op.end_in_before - op.start_in_before !== 0) { return false; } return /^\s$/.test( before_tokens.slice(op.start_in_before, +op.end_in_before + 1 || 9e9) ); }; for (j = 0, len1 = operations.length; j < len1; j++) { op = operations[j]; if ( (is_single_whitespace(op) && last_op.action === 'replace') || (op.action === 'replace' && last_op.action === 'replace') ) { last_op.end_in_before = op.end_in_before; last_op.end_in_after = op.end_in_after; } else { post_processed.push(op); last_op = op; } } return post_processed; }; consecutive_where = function (start, content, predicate) { var answer, i, index, last_matching_index, len, token; content = content.slice(start, +content.length + 1 || 9e9); // console.log(content, 'content'); // console.log(predicate, 'predicate'); // console.log(start, 'start'); last_matching_index = void 0; for (index = i = 0, len = content.length; i < len; index = ++i) { token = content[index]; answer = predicate(token); if (answer === true) { last_matching_index = index; } if (answer === false) { break; } } if (last_matching_index != null) { return content.slice(0, +last_matching_index + 1 || 9e9); } return []; }; wrap = function (tag, content) { var length, non_tags, position, rendering, tags; rendering = ''; position = 0; length = content.length; while (true) { if (position >= length) { break; } non_tags = consecutive_where(position, content, isnt_tag); position += non_tags.length; if (non_tags.length !== 0) { rendering += `<${tag}>${non_tags.join('')}`; } if (position >= length) { break; } tags = consecutive_where(position, content, is_tag); position += tags.length; rendering += tags.join(''); } return rendering; }; op_map = { equal: function (op, before_tokens, after_tokens) { return before_tokens .slice(op.start_in_before, +op.end_in_before + 1 || 9e9) .join(''); }, insert: function (op, before_tokens, after_tokens) { var val; const token_length = after_tokens.map(element => element.length); val = after_tokens.slice(op.start_in_after, op.end_in_after + 1 || 9e9); return wrap('ins', val); }, delete: function (op, before_tokens, after_tokens) { var val; val = before_tokens.slice(op.start_in_before, +op.end_in_before + 1 || 9e9); return wrap('del', val); }, }; op_map.replace = function (op, before_tokens, after_tokens) { return ( op_map.delete(op, before_tokens, after_tokens) + op_map.insert(op, before_tokens, after_tokens) ); }; render_operations = function (before_tokens, after_tokens, operations) { var i, len, op, rendering; rendering = ''; console.log(operations.length, 'operations.length'); console.log(operations, 'operations'); for (i = 0, len = operations.length; i < len; i++) { op = operations[i]; rendering += op_map[op.action](op, before_tokens, after_tokens); } return rendering; }; diff = function (before, after) { var ops; if (before === after) { return before; } before = html_to_tokens(before); after = html_to_tokens(after); ops = calculate_operations(before, after); return render_operations(before, after, ops); }; diff.html_to_tokens = html_to_tokens; diff.find_matching_blocks = find_matching_blocks; find_matching_blocks.find_match = find_match; find_matching_blocks.create_index = create_index; diff.calculate_operations = calculate_operations; diff.render_operations = render_operations; if (typeof define === 'function') { define([], function () { return diff; }); } else if (typeof module !== 'undefined' && module !== null) { module.exports = diff; } else { this.htmldiff = diff; }

Call the code I wrote:

let oldTxt = Match.html_to_tokens(a); let newTxt = Match.html_to_tokens(b); let ops = Match.calculate_operations(a, b); const render = Match.render_operations(newTxt, oldTxt, ops); document.getElementById('output').innerHTML = render;

a is the first html, b is the changed html (not stringified)

Do I have any ideas on how to proceed? Thank you for your help!

I know the error is somewhere inside the "create_index" function. Because I'm calculating the length of the text and then comparing it to the length of the text, each tag/word counts as one.

Code Sandbox