#!/usr/bin/perl
########################################################################
#
#	Universal filter of Russian encodings version 2.6a
#	created by Serge Winitzki (1997-2000). This is free software.
#	http://members.linuxstart.com/~winitzki/
#
#	Features:
#
#	- Self-contained perl script, does not require any Perl packages.
#	- Supported encodings: DOS CP-866 or "alternative" ('alt'),
#	ISO8859-5 ('iso'), KOI-8 ('koi'), Russkaja Latinica ('lat'),
#	Macintosh ('mac'), Unicode ('uni'), UTF-8 ('utf'), Windows CP-1252 ('win').
#	- Letters 'YO' and 'yo' correctly supported in all encodings.
#	- Strict 'Russkaja Latinica' conformance for the 'lat' encoding which allows
#	almost unambiguous repeated native<->latinized translations of Russian text.
#	- Determines the required encodings from invoked script's name
#	(if renamed alt2koi etc.) or from command line option.
#	- Supports KOI8-C extension for old Russian letters (Yat', Izhitsa, I, Fita)
#	The experimental KOI8-C encoding is a mix of KOI8-U and 1251.
#	Old Russian letters are only supported for WIN, KOI and UTF
#	encodings. (not in ALT, ISO, MAC!)
#	Recognizes copyright sign in Win, KOI (using KOI8-C), Mac.
#
#	Limitations:
#
#	- Unicode/UTF-8 support is preliminary and does not fully conform to the
#	official ISO8859-5 to Unicode cross-map (it works only for Cyrillic text).
#	- Unicode/UTF-8 support only recognizes Cyrillic Unicode characters, so results
#	on mixed cyrillic/other languages texts are most likely going to be wrong.
#	- Unicode/UTF-8 output options may not generate valid Unicode unless input is
#	strictly a mix of Cyrillic and normal ASCII letters (no "copyright"
#	characters, hyphens or anything like that).
#	- Old Russian support is only compatible with MY OWN "KOI8-C"
#	extension! Unicode symbols are generated and recognized in
#	UTF-8 mode only (i.e. -koi2utf, -utf2koi) and for letters only.
#	Converting text with these extra letters to any other encoding
#	will give wrong results. Fake ISO entries (0x80-0x9F) are generated
#	for internal purposes only. Win encoding of old Russian letters
#	corresponds to the font "Oldrus.TTF" (warning: the font does not
#	contain Copyright sign, therefore conversion KOI->Win->Koi will
#	lose Capital Izhitsa!)
#
#	Command line options (all options are case-insensitive):
#	e.g. -alt2koi or -mac2win or whatever	select required encodings
#	-bylines	convert text line by line instead of loading all into memory
#
#	Options for lat -> ... conversion:
#	-tex	do not translate text inside $..$, $$..$$ and \command names
#	-wisv	translate w as v (default w is tshcha)
#	-qisja	translate q as ja (default q is tshcha)
#	-usekh	translate kh as h (default kh='k''h')
#	-oldrus	allow old Russian letters (Yat', Izhitsa, I, Fita)
#
#	Example usage (if this file is 323.pl in current directory):
#		perl 323.pl -wisv -lat2utf < inputfile > outputfile
#
########################################################################
#
#	Installation:
#	if needed, edit the first line to reflect your perl location (`which perl`);
#	put this script somewhere on the path with executable permission;
#	optionally make links to this script named alt2win, win2koi etc.
#	(The script can determine the source/target encoding from its *name*.)
#	e.g. copy this file to /usr/local/bin/323.pl and then say
#	cd /usr/local/bin; chmod 755 323.pl
#	ln -s 323.pl alt2koi; ln -s 323.pl koi2alt; and so on (optional) for all needed
#	combinations of alt, iso, koi, mac, win, lat. Names are case-insensitive.
#	After all this, use as a filter. For example, `lat2koi < file1 > file2`
#	or else have to specify encoding as `323.pl -lat2koi < file1 > file2`
#
############################# start of script ##########################
#
#	Direct native encodings: The tables may have unequal length, since
#	the translation only uses the initial parts of the tables. For instance,
#	non-Russian cyrillic symbols are only present in Win, ISO and KOI tables (and not all)
#	Symbols missing in a table are substituted by 0x7F
#	The order of letters must be otherwise the same in all tables:
#	\x7F (c) (No.)
#	ABVGDE Zh ZIJKLMNOPRSTUFHC Ch Sh Shch ~ Y ' E' Yu Ya
#	abvgde zh zijklmnoprstufhc ch sh shch ~ y ' e' yu ya
#	YO yo Iroman iroman YI yi IE ie Yat' yat' Fita fita Izhitsa izhitsa Psi psi
#

$rusmac="\x7F\xA9\xDC\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xDF\xDD\xDE\xA7\xB4\xBA\xBB\xB8\xB9";
$rusalt="\x7F\x7F\xFC\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\x49\x69\xF4\xF5\xF2\xF3";	# No Iroman/iroman, substituting I/i
$ruswin="\x7F\xA9\xB9\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF\xA8\xB8\xB2\xB3\xAF\xBF\xAA\xBA\x80\x90\xAA\xBA\xA1\xA2";	# old Russian for "oldrus.ttf", except yat' which is faked by the Serbian (0x80, 0x90), and no Belarussian
$ruskoi="\x7F\xBF\xB9\xE1\xE2\xF7\xE7\xE4\xE5\xF6\xFA\xE9\xEA\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFE\xFB\xFD\xFF\xF9\xF8\xFC\xE0\xF1\xC1\xC2\xD7\xC7\xC4\xC5\xD6\xDA\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDE\xDB\xDD\xDF\xD9\xD8\xDC\xC0\xD1\xB3\xA3\xB6\xA6\xB7\xA7\xB4\xA4\xB2\xA2\xBC\xAC\xB1\xA1\xBA\xAA";	# KOI8-C really
$rusiso="\x7F\x7F\xF0\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF\xA1\xF1\xA6\xF6\xA7\xF7\xA4\xF4\x82\x83\x92\x93\x94\x95\x90\x91";	# All codes outside of A0-FF range are not really ISO but our hack for old Russian letters.

#
####################### main part of the script ########################
#

$from="nothing";
$to="nothing";
$lat_output="no";	#whether latinized output is requested. special flag.
$lat_input="no";	#same for input
$uni_output="no";	#whether unicode output is requested
$uni_input="no";
$utf_output="no";	#whether UTF-8 output is requested
$utf_input="no";

$help='Universal converter of Russian encodings version 2.4
Created by Serge Winitzki, 1999. No warranty. This is free software.
http://www.geocities.com/CapeCanaveral/Lab/5735/1/

   Supported encodings: alt, iso, koi, lat, mac, uni, utf, win
   Example usage:

	323 -alt2koi < inputfile > outputfile

   Or rename/link to "xxx2xxx" where xxx is one of the supported encodings and

	alt2koi < inputfile > outputfile

   Note: latinized encoding "lat" is implemented according to the "Russkaja
Latinica" scheme. See http://www.geocities.com/Athens/Forum/5344/RL/ for
more details. Sample options for "lat" input:

	323 -lat2koi -usekh -wisv -qisja -tex < inputfile > outputfile

   See the script preamble for more information.
';

$all_enc="acfiklmnostuw";

if ("@ARGV" =~ /-([$all_enc]{3})2([$all_enc]{3})/i) {
	$a1=$1;
	$a2=$2;
	$error="Incorrect encoding '$a1 -> $a2' on command line.";
} else {
	#decide the source and target encoding based on our name
	$name=`basename $0`;
	if ($name =~ /([$all_enc]{3})2([$all_enc]{3})/i) {	#this should match koi2win etc.
	$a1="$1";
	$a2="$2";
	}
	$error="Incorrect usage of this script, see $0 for documentation.";
}

if ("@ARGV" =~ /help/i) {
	print $help . "\n";
	exit;
}


{
	if ($a1 =~ /win/i) {
		$from="$ruswin";
	} elsif ($a1 =~ /koi/i) {
		$from="$ruskoi";
	} elsif ($a1 =~ /alt/i) {
		$from="$rusalt";
	} elsif ($a1 =~ /mac/i) {
		$from="$rusmac";
	} elsif ($a1 =~ /iso/i) {
		$from="$rusiso";
	} elsif ($a1 =~ /uni/i) {
		$from="$rusiso";
		$uni_input="yes";
	} elsif ($a1 =~ /utf/i) {
		$from="$rusiso";
		$utf_input="yes";
	} elsif ($a1 =~ /lat/i) {
		$from="$ruskoi";	#this is because our latin table is for koi
		$lat_input="yes";
	}
	if ($a2 =~ /win/i) {
		$to="$ruswin";
	} elsif ($a2 =~ /koi/i) {
		$to="$ruskoi";
	} elsif ($a2 =~ /alt/i) {
		$to="$rusalt";
	} elsif ($a2 =~ /mac/i) {
		$to="$rusmac";
	} elsif ($a2 =~ /iso/i) {
		$to="$rusiso";
	} elsif ($a2 =~ /uni/i) {
		$to="$rusiso";
		$uni_output="yes";
	} elsif ($a2 =~ /utf/i) {
		$to="$rusiso";
		$utf_output="yes";
	} elsif ($a2 =~ /lat/i) {
		$to="$ruskoi";	#this is because our latin table is for koi
		$lat_output="yes";
   }

}

if ($to eq "nothing" or $from eq "nothing") {	#wrong options
	print "$error\n$0 -help for brief usage instructions.\n";
	exit 1;
}

if (not ("@ARGV" =~ /-bylines/i)) {
	undef $/;	#make it convert the whole file at once, usually much faster.
}

while(<STDIN>) {	#main loop

#effectively we want to do e.g.
# eval ("tr/$ruswin/$rusalt/");	#because tr requires constant strings


	if ($lat_input eq "yes") {
		&translate_lat_to_koi();	#call special procedure operating on $_
	} elsif ($uni_input eq "yes") {
		&translate_uni_to_iso();
	} elsif ($utf_input eq "yes") {
		&translate_utf_to_iso();
	}
	#now $_ contains all cyrillic text and we need to transform it
	# Main transformation. First check that $from is not longer than $to, then transform $_
	if (length($from)>length($to)) {
		$from=substr($from, 0, length($to));
	}
	eval ("tr/$from/$to/") unless ($from eq $to);	#we need to do this now
	#now $_ contains correctly transformed text
	if ($lat_output eq "yes") {
		&translate_koi_to_lat();	#call special procedure operating on $_
	} elsif ($uni_output eq "yes") {
		&translate_iso_to_uni();
	} elsif ($utf_output eq "yes") {
		&translate_iso_to_utf();
	}
	print;
}

#################### end of main part of the script ####################

sub translate_koi_to_lat {
#use this procedure to replace each character in $_

#using Russkaja Latinica standard (by Alexy Khabrov and Serge Winitzki, 1995)

#first, break digraphs Y-A, Y-U, Y-O - just in case we get them in the text although they are ungrammatical. Insert the canonical breaking char \\.

	s/([\xF9\xD9])([\xE1\xEF\xF5\xC1\xCF\xD5])/$1\\$2/g;

#also break the sh-ch which should rarely happen but still

	s/([\xFB\xDB])([\xFE\xDE])/$1\\$2/g;

#second, transform letters that require combinations. Using "x" for "kha", "j'" for "i kratkoe, "shch" for "tshcha", "e'" for "e oborotnoe".
	s/\xB3/Yo/g;
	s/\xF6/Zh/g;
	s/\xEA/J'/g;
	s/\xFE/Ch/g;
	s/\xFB/Sh/g;
	s/\xFD/Shch/g;
	s/\xFC/E'/g;
	s/\xE0/Yu/g;
	s/\xF1/Ya/g;

	s/\xA3/yo/g;
	s/\xD6/zh/g;
	s/\xCA/j'/g;
	s/\xDE/ch/g;
	s/\xDB/sh/g;
	s/\xDD/shch/g;
	s/\xDC/e'/g;
	s/\xC0/yu/g;
	s/\xD1/ya/g;

	if ("@ARGV" =~ /-oldrus/i) {
		# KOI8-C extension: EXPERIMENTAL. Designed not to conflict with KOI8-U.

		s/\xB6/I~/g;	# I roman
		s/\xA6/i~/g;	# i roman
		s/\xB2/E~/g;	# Yat'
		s/\xA2/e~/g;	# yat'
		s/\xBC/~F/g;	# Fita
		s/\xAC/~f/g;	# fita
		s/\xB1/~V/g;	# Izhitsa
		s/\xA1/~v/g;	# izhitsa
	}

#then replace other letters

tr/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE3\xFF\xF9\xF8\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC3\xDF\xD9\xD8/ABVGDEZIKLMNOPRSTUFXC~Y'abvgdeziklmnoprstufxc~y'/;
}

sub translate_lat_to_koi {	#operate on $_ only, translate latinized input to KOI8

	# Rules for digraphs: 1. An all-lowercase digraph must be defined.
	# 2. If the first letter is [A-z], then the digraph is lowercase if both letters are lowercase and uppercase otherwise.
	# 3. If the first letter is not [A-z], then the case of the 2nd letter determins the case of the digraph.

	%translit=(
		# Canonical RL scheme
		"Shch" => "\xFD",
		"shch" => "\xDD",
		"Yo" => "\xB3",
		"yo" => "\xA3",
		"Jo" => "\xB3",
		"jo" => "\xA3",
		"Zh" => "\xF6",
		"zh" => "\xD6",
		"J'" => "\xEA",
		"j'" => "\xCA",
		"J`" => "\xEA",
		"j`" => "\xCA",
		"Ch" => "\xFE",
		"ch" => "\xDE",
		"Sh" => "\xFB",
		"sh" => "\xDB",
		"E'" => "\xFC",
		"e'" => "\xDC",
		"E`" => "\xFC",
		"e`" => "\xDC",
		"`E" => "\xFC",
		"`e" => "\xDC",
		"Yu" => "\xE0",
		"yu" => "\xC0",
		"Ju" => "\xE0",
		"ju" => "\xC0",
		"Ya" => "\xF1",
		"ya" => "\xD1",
		"Ja" => "\xF1",
		"ja" => "\xD1",
	);

	if ("@ARGV" =~ /-oldrus/i) {
		# EXPERIMENTAL: KOI8-C extensions
		$translit{"~F"} = "\xBC";	# Fita
		$translit{"~f"} = "\xAC";	# fita
		$translit{"E~"} = "\xB2";	# Yat'
		$translit{"e~"} = "\xA2";	# yat'
		$translit{"I~"} = "\xB6";	# I roman
		$translit{"i~"} = "\xA6";	# i roman
		$translit{"~V"} = "\xB1";	# Izhitsa
		$translit{"~v"} = "\xA1";	# izhitsa
	}

	%malleable=(	# lowercase
		'~' => "\xDF",
		'`' => "\xD8",
		"'" => "\xD8",
		'@' => "\xDC",
	);

	%malleable_uc=(	# uppercase
		'~' => "\xFF",
		'`' => "\xF8",
		"'" => "\xF8",
		'@' => "\xFC",
	);

	$i=0;	#pointer into the input string ($_)
	
	$EnglishNow=0;	#state flag for the digestion machine
	#now need to set some options
	$want_tex = ("@ARGV" =~ /-tex/i) ? 1 : 0;
	$want_wisv = ("@ARGV" =~ /-wisv/i) ? 1 : 0;
	$want_qisja = ("@ARGV" =~ /-qisja/i) ? 1 : 0;
	$want_kh = ("@ARGV" =~ /-usekh/i) ? 1 : 0;
	
	#need to modify the tables now
	if ($want_kh) {
		$translit{"Kh"} = "\xE8";
		$translit{"kh"} = "\xC8";
	}
		
	$output="";	#to hold the output text

	while ($i < length($_)) {	#loop through the input
		# The current char is substr($_,i,1).
		# Note that $i will not always advance by 1 and sometimes be changed inside &digest_some()
		my $doutput = &digest_some();
		$i += length($doutput);
		$output .= $doutput;
	}
	$_ = $output;
}

sub digest_some {	# Return next output char(s), using $i as read-only position in $_ and using flags $want_tex and $want_wisv

# our state: $EnglishNow=2 if inside $$ or after '\ ', 1 if inside \command, 0 if in Russian.
# the '$' and \commands are all ignored unless $want_tex
	my $thischar = substr($_, $i, 1);	#just caching, aren't going to change it
	my $nextchar = substr($_, $i+1, 1);	#this may be changed

	if ($EnglishNow == 2) {
	  if ($want_tex) {
		if ($thischar . $nextchar eq '$$') {
			$EnglishNow= 0;
			return '$$';
		}
		if ($thischar eq '$') {
			$EnglishNow= 0;
			return '$';
		}
	  }
		# insert any additional switchers here
		if ($thischar . $nextchar eq '\\ ') {
			#switching back to Russian
			$EnglishNow= 0;
			$i += 2;	#incrementing $i here since not returning anything
			return "";
		}
		# ok, English is still going on
		return $thischar;
	} # case of $EnglishNow == 2 is done	

	if ($EnglishNow == 1 and $want_tex) {
		if ($thischar eq ' ' or $thischar eq '\n') {	# terminates \command
			$EnglishNow= 0;
			return $thischar;
		}
		if ($thischar . $nextchar eq '$$') {
			$EnglishNow= 2;
			return '$$';
		}
		if ($thischar eq '$') {
			$EnglishNow= 2;
			return '$';
		}
		if ($thischar eq '\\') {
			if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts another \command right after this one
				$EnglishNow= 1;
				return $thischar;
			}
		}	
		# didn't switch to Russian, continue without translation
		return $thischar;
	} # case of $EnglishNow == 1 is done
	
	if ($EnglishNow == 0) {
	  if ($want_tex) {
		if ($thischar . $nextchar eq '$$') {
			$EnglishNow= 2;
			return '$$';
		}
		if ($thischar eq '$') {
			$EnglishNow= 2;
			return '$';
		}
	  }
	  if ($thischar eq '\\') {
	  	if ($want_tex) {
			if ($nextchar =~ /[0-9A-z@\\\"\':]/) { # starts \command
				$EnglishNow= 1;
				return $thischar;
			}
		}
		if ($nextchar eq ' ') {	# switch to English now
			$EnglishNow = 2;
			$i += 2;
			return "";
		}
		if ($nextchar eq '\\') {	# double backslash, skipping one
			$i += 1;
			return "\\";
		}
		
		#we get a backslash in Russian mode and not followed by space
		#tex mode quirks and double backslashes are already done
		#so we should swallow it and go on with the next char
		$i += 1;
		return "";
	  }	# End of processing backslash char
		# all switches have been processed, now do Russian stuff
		
		# first, the 4-letter combination for "tshcha"
		
		if (substr($_, $i, 4) eq 'shch') {	#lowercase
			$i += 3;
			return $translit{'shch'};
		}
		
		if (substr($_, $i, 4) =~ /shch/i) {	# uppercase: we now know it's not lowercase so any case combination works
			$i += 3;
			return $translit{'Shch'};
		}
		
		#now looking for digraphs
		$digraph = $thischar . $nextchar;
		$digraph =~ tr/A-Z/a-z/;	#now it's all lowercase
		if (defined($translit{$digraph})) {	# Found a digraph!
			if ($nextchar =~ /[A-Z]/ or $thischar =~ /[A-Z]/) {	# uppercase
				$thischar =~ tr/a-z/A-Z/;	# Clobber, clobber
				$nextchar =~ tr/A-Z/a-z/ if ($thischar =~ /[A-Z]/);	# Do not lowercase the second char if the first char is not a letter!
				$digraph = $thischar . $nextchar;
			}
			$i += 1;
			return $translit{$digraph};
		}

		# now search for malleables
		if (defined($malleable{$thischar})) {	# Found a malleable.
			$prevchar = ($i>0) ? substr($_, $i-1, 1) : "";
			if ($thischar eq '`' or $thischar eq "'") {
				if (not ($prevchar =~ /[\@A-Za-z]/) and $nextchar =~ /[\@A-Za-z]/) {	# ' and ` at beginning of word are not translated
					return $thischar;
				}
				
			}
			if ($prevchar eq '\\') {
				return $thischar;	# ' and ` prefixed by \ are not translated
			}
			if ($prevchar eq '^') {	# Special cases.
				return $malleable_uc{$thischar};
			}
			if ($prevchar eq '_') {
				return $malleable{$thischar};
			}
			if (($prevchar =~ /[A-Z \n\t]/ or length($prevchar) == 0) and $nextchar =~ /[A-Z \n\t]/) {
				return $malleable_uc{$thischar};
			}
			return $malleable{$thischar};
		}

		#if we are still here, we have a simple letter
		if ($want_qisja) {
			$thischar = ($thischar eq 'Q') ? $translit{'Ja'} : (($thischar eq 'q') ? $translit{'ja'} : $thischar);
		}
		if ($want_wisv) {
			$thischar = ($thischar eq 'W') ? 'V' : (($thischar eq 'w') ? 'v' : $thischar);
		}
		$thischar =~ tr/ABVGDEZIKLMNOPRSTUFXHCYWQJabvgdeziklmnoprstufxhcywqj/\xE1\xE2\xF7\xE7\xE4\xE5\xFA\xE9\xEB\xEC\xED\xEE\xEF\xF0\xF2\xF3\xF4\xF5\xE6\xE8\xE8\xE3\xF9\xFD\xFD\xEA\xC1\xC2\xD7\xC7\xC4\xC5\xDA\xC9\xCB\xCC\xCD\xCE\xCF\xD0\xD2\xD3\xD4\xD5\xC6\xC8\xC8\xC3\xD9\xDD\xDD\xCA/;
		return $thischar;

	} # case of EnglishNow == 0 is done

}

sub translate_iso_to_uni {
	$output="";
	$i=0;
	while ($i < length($_)) {	#loop through the input char by char, slow
		$thischar=substr($_,$i,1);
		$output .= (unpack("C", $thischar)> 160) ? "\x04" . pack("C", unpack("C" ,$thischar)-160) : "\x00" . $thischar;
		++$i;
	}
	$_=$output;
}

sub translate_uni_to_iso {
	$output="";
	$i=0;
	while ($i < length($_)) {	#loop through the input char by char, slow
		$thischar=substr($_,$i,1);
		if ($thischar eq "\x04" and $i < length($_)-1) {	# Now look at the second part of the 16 bit code.
#			$nextchar=pack("C", unpack("C", substr($_,$i+1,1))+160);
			$nextchar = unpack("C", substr($_,$i+1,1))+160;
			if ($nextchar > 160 and $nextchar < 256) {	#we are in range for ISO cyrillics
				$nextchar = pack("C", $nextchar);
#				$output .= (index($rusiso, $nextchar)>=0) ? $nextchar : substr($_,$i,2);	#this would be too paranoid. Let's give lje, nje &co. a chance.
				$output .= $nextchar;
			} else {	#we are out of range, preserve Unicode as is
				$output .= substr($_,$i,2);
			}
			$i += 2;
		} else {	#we are not looking at a Unicode cyrillic wide char
			$output .= "$thischar" unless ($thischar eq "\x00");
			$i += 1;
		}
	}
	$_=$output;
}

#UTF-8 encoding scheme. We are converting it to ISO or to it from ISO.
#So 1) We create phony ISO codes for our extra letters:
#WinISO	KOI8-C		Irom irom YI yi IE ie Yat' yat' Fita fita Izhitsa izhitsa Psi psi
#B2	A6	"\xB6";	# I roman
#B3	F6	"\xA6";	# i roman
#AF	A7	"\xB7";	# YI (I:)
#BF	F7	"\xA7";	# yi (i:)
#AA	A4	"\xB4";	# IE
#BA	F4	"\xA4";	# ie
#	82	"\xB2";	# Yat'
#	83	"\xA2";	# yat'
#	92	"\xBC";	# Fita
#	93	"\xAC";	# fita
#	94	"\xB1";	# Izhitsa
#	95	"\xA1";	# izhitsa
#	90	"\xBA";	# Psi
#	91	"\xAA";	# psi
#2) All UTF-8 entities will be 2 bytes long, so we create the leading byte by hand.
#For ISO characters c in range 0x80 to 0x9F, the UTF-8 is 0xD1 0x(c+0x20)
#For ISO characters c in range 0xA1 to 0xDF, the UTF-8 is 0xD0 0x(c-0x20)
#For ISO characters c in range 0xE0 to 0xFF, the UTF-8 is 0xD1 0x(c-0x60)

sub translate_iso_to_utf {
	$output="";
	$i=0;
	while ($i < length($_)) {	#loop through the input
		$thischar=substr($_,$i,1);
		$thischarcode=unpack("C", $thischar);
		$output .= ($thischarcode >= 0xE0) ? "\xD1" . pack("C", $thischarcode-0x60) : ($thischarcode >= 0xA1) ? "\xD0" . pack("C", $thischarcode-0x20) : ($thischarcode >= 0x80) ? "\xD1" . pack("C", $thischarcode+0x20) : $thischar;
		++$i;
	}
	$_=$output;
}

sub translate_utf_to_iso {	# Warning: output may be wrong if input contains anything but UTF-8 cyrillic chars or ASCII!
	$output="";
	$i=0;
	while ($i < length($_)) {	#loop through the input
		$thischar=substr($_,$i,1);
		if (($thischar eq "\xD0" or $thischar eq "\xD1")  and $i < length($_)-1) {	# Now look at the second part of the 16 bit code.
			
			$nextchar = unpack("C", substr($_,$i+1,1))-128+160 + (($thischar eq "\xD1") ? 64 : 0);
			$nextchar -= 128	# Compensate for our fake ISO codes for old Russian.
				if ($thischar eq "\xD1" and $nextchar >= 256);
			if ($nextchar > 127 and $nextchar < 256) {	#we are in range for ISO cyrillics plus our fake ISO codes
				$nextchar = pack("C", $nextchar);
#				$output .= (index($rusiso, $nextchar)>=0) ? $nextchar : substr($_,$i,2);	#this would be too paranoid. Let's give lje, nje &co. a chance.
				$output .= $nextchar;
			} else {	#we are out of range, preserve UTF-8 as is
				$output .= substr($_,$i,2);
			}
			$i += 2;
		} else {	#we are not looking at a Unicode cyrillic wide char
			$output .= "$thischar";
			$i += 1;
		}
	}
	$_=$output;
}