// Fix Mixed Encoding
//
// By Ilya Dogolazky. Last update Feb 3, 2008 — Installed 67 times. Daily Installs: 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
// Fix Mixed Encoding 0.0.2 (alpha) 2008-02-03
// ------------------------------------------
// Copyright (c) 2008, Ilya Dogolazky
// Released under the GPL license, see http://www.gnu.org/copyleft/gpl.html for details
// ------------------------------------------
// ==UserScript==
// @name           Fix Mixed Encoding
// @namespace      http://www.math.uni-bonn.de/people/ilyad/gm/mixed
// @description    Fixes "mix-encoded" pages containing Cyrillic letters in KOI8-R and punctuation symbols in CP-1251 by recoding the latter. See some examples in the full description. The script is not very fast. Don't use this script unless you know what you are doing.
// @include        *
// ==/UserScript==

// Code-point lookup table consulted by utf8_koi8r_win1251_utf8().
// Keys and values are Unicode code points. According to the Perl generator
// kept in the comment further below, one entry is emitted per byte B in the
// range 0x80..0xBF:
//   key   = code point obtained by decoding B as KOI8-R
//   value = code point obtained by decoding B as windows-1251
// Looking a character up therefore answers: "if this character came from
// reading byte B as KOI8-R, which character would B have been in CP-1251?"
// Bytes 0x98, 0x9A, 0x9E, 0xA3, 0xB3 and 0xBF are in the generator's skip
// list and have no entry (marked by the "skip" comments inside the table);
// the reason for skipping them is not recorded in this file.
// Do not edit by hand: regenerate with the Perl script instead.
const ukwu = /* automatically generated by the Perl script below */
{
0x2500: 0x0402, 0x2502: 0x0403, 0x250C: 0x201A, 0x2510: 0x0453, 0x2514: 0x201E, 0x2518: 0x2026, 0x251C: 0x2020, 0x2524: 0x2021,
0x252C: 0x20AC, 0x2534: 0x2030, 0x253C: 0x0409, 0x2580: 0x2039, 0x2584: 0x040A, 0x2588: 0x040C, 0x258C: 0x040B, 0x2590: 0x040F,
0x2591: 0x0452, 0x2592: 0x2018, 0x2593: 0x2019, 0x2320: 0x201C, 0x25A0: 0x201D, 0x2219: 0x2022, 0x221A: 0x2013, 0x2248: 0x2014,
/* skip 0x98 */ 0x2265: 0x2122, /* skip 0x9A */ 0x2321: 0x203A, 0x00B0: 0x045A, 0x00B2: 0x045C, /* skip 0x9E */ 0x00F7: 0x045F,
0x2550: 0x00A0, 0x2551: 0x040E, 0x2552: 0x045E, /* skip 0xA3 */ 0x2553: 0x00A4, 0x2554: 0x0490, 0x2555: 0x00A6, 0x2556: 0x00A7,
0x2557: 0x0401, 0x2558: 0x00A9, 0x2559: 0x0404, 0x255A: 0x00AB, 0x255B: 0x00AC, 0x255C: 0x00AD, 0x255D: 0x00AE, 0x255E: 0x0407,
0x255F: 0x00B0, 0x2560: 0x00B1, 0x2561: 0x0406, /* skip 0xB3 */ 0x2562: 0x0491, 0x2563: 0x00B5, 0x2564: 0x00B6, 0x2565: 0x00B7,
0x2566: 0x0451, 0x2567: 0x2116, 0x2568: 0x0454, 0x2569: 0x00BB, 0x256A: 0x0458, 0x256B: 0x0405, 0x256C: 0x0455, /* skip 0xBF */
}

// Main pass: visit every text node that is not inside a <script> or <style>
// element and recode its characters through utf8_koi8r_win1251_utf8.
// A plain indexed loop is used because the SpiderMonkey-only
// "for each (... in ...)" statement is a syntax error in current engines.
// nodeValue is written back only when recoding actually changed the text,
// so untouched nodes cause no DOM mutation.
var text_nodes = xpath_list("//text()[not(ancestor::script) and not(ancestor::style)]") ;
for(var node_index=0; node_index<text_nodes.length; ++node_index)
{
  var text = text_nodes[node_index] ;
  var recoded = recode(text.nodeValue, utf8_koi8r_win1251_utf8) ;
  if(recoded !== text.nodeValue)
    text.nodeValue = recoded ;
}

// Maps one code point through the ukwu table.
//   code    - numeric code point to look up
//   returns - the table's replacement when ukwu has a truthy entry for code,
//             otherwise code itself, unchanged
function utf8_koi8r_win1251_utf8(code)
{
  var replacement = ukwu[code] ;
  if(replacement)
    return replacement ;
  return code ;
}

// Rebuilds a string by passing each of its UTF-16 code units through a
// mapping function.
//   str     - input string; read with charCodeAt, one code unit at a time
//   mapping - function called with a single numeric code unit; its result is
//             handed to String.fromCharCode
//   returns - the concatenation of the converted units, in the original order
//             ("" for an empty input)
function recode(str, mapping)
{
  var pieces = [] ;
  var pos = 0 ;
  while(pos < str.length)
  {
    var unit = str.charCodeAt(pos) ;
    pieces.push(String.fromCharCode(mapping(unit))) ;
    pos += 1 ;
  }
  return pieces.join("") ;
}

// Evaluates an XPath expression and returns the matching nodes as a plain
// array.
//   xpath   - XPath expression string passed to document.evaluate
//   root    - context node for the evaluation; window.document when falsy
//   order   - truthy requests an ORDERED_NODE_SNAPSHOT_TYPE result, otherwise
//             UNORDERED_NODE_SNAPSHOT_TYPE is used
//   returns - array holding snapshotItem(0) .. snapshotItem(snapshotLength-1)
function xpath_list(xpath, root, order)
{
  var context = root ? root : window.document ;

  var snapshot_kind ;
  if(order)
    snapshot_kind = XPathResult.ORDERED_NODE_SNAPSHOT_TYPE ;
  else
    snapshot_kind = XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE ;

  var found = document.evaluate(xpath, context, null, snapshot_kind, null) ;

  // Copy the snapshot into a regular array so callers can iterate it freely.
  var nodes = [] ;
  var index = 0 ;
  while(index < found.snapshotLength)
  {
    nodes.push(found.snapshotItem(index)) ;
    index += 1 ;
  }
  return nodes ;
}
/*
  use strict ;
  use warnings ;
  use Encode ;
  my %skip ;
  $skip{$_}=1 for(152, 154, 163, 179, 0x9E, 0xBF) ;
  for (0x80..0xBF)
  {
    if($skip{$_}) {
      printf "%s* skip 0x%02X *%s", "/", $_, "/" ;
    } else {
      printf "0x%04X: 0x%04X,", ord(decode("koi8-r",chr($_))), ord(decode("windows-1251",chr($_))) ;
    }
    print($_%8==7 ? "\n" : " ") ;
  }
*/

// vim:tw=0:smartindent