arsd.characterencodings source code

1 // helper program is in ~me/encodings.d to make more tables from wikipedia
2 
3 /**
4 	This is meant to help get data from the wild into utf8 strings
5 	so you can work with them easily inside D.
6 
7 	The main function is convertToUtf8(), which takes a byte array
8 	of your raw data (a byte array because it isn't really a D string
9 	yet until it is utf8), and a runtime string naming its current
10 	encoding.
11 
12 	The current encoding argument is meant to come from the data's
13 	metadata, and is flexible on exact format - it is case insensitive
14 	and takes several variations on the names.
15 
16 	This way, you should be able to send it the encoding string directly
17 	from an XML document, a HTTP header, or whatever you have, and it
18 	ought to just work.
19 
20 	Example:
21 		auto data = cast(immutable(ubyte)[])
22 			std.file.read("my-windows-file.txt");
23 		string utf8String = convertToUtf8(data, "windows-1252");
24 		// utf8String can now be used
25 
26 
27 	The encodings currently implemented for decoding are:
28 		UTF-8 (a no-op; it simply casts the array to string)
29 		UTF-16,
30 		UTF-32,
31 		Windows-1252, Windows-1251, KOI8-R,
32 		ISO 8859 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, and 16.
33 
34 	It treats ISO 8859-1, Latin-1, and Windows-1252 the same way, since
35 	those labels are pretty much de-facto the same thing in wild documents.
36 
37 
38 	This module currently makes no attempt to look at control characters.
39 */
40 module arsd.characterencodings;
41 
42 import std.string;
43 import std.array;
44 import std.conv;
45 
/// Best-effort variant of convertToUtf8.
///
/// First tries convertToUtf8 and validates its result as UTF-8. If either
/// step throws an Exception (for example because the encoding label is
/// unknown), it falls back to returning only the bytes below 128 from
/// `data`, dropping everything else, instead of propagating the error.
string convertToUtf8Lossy(immutable(ubyte)[] data, string dataCharacterEncoding) {
	import std.utf;

	try {
		auto converted = convertToUtf8(data, dataCharacterEncoding);
		validate(converted);
		return converted;
	} catch(Exception) {
		// conversion or validation failed; salvage the ASCII subset below
	}

	string asciiOnly;
	foreach(octet; data) {
		if(octet >= 128)
			continue; // not ASCII, so it is dropped
		asciiOnly ~= octet;
	}
	return asciiOnly;
}
61 
/**
	Takes data from a given character encoding and returns it as UTF-8.

	Params:
		data = the raw bytes to convert
		dataCharacterEncoding = label of the encoding `data` is in. It is
			matched case-insensitively after removing spaces, '-' and '_',
			so "UTF-16 LE", "utf16le" and "Utf_16-LE" are all equivalent.

	Returns: the text as a UTF-8 string. For "utf8", "ascii" and "usascii"
		the bytes are returned as-is, without validation.

	Throws: Exception if the label is not recognized, or if UTF-16/UTF-32
		input has a length that is not a whole number of code units.
		Transcoding UTF-16/UTF-32 may also throw on invalid code units.

	Notes:
		A bare "utf16" / "utf32" label is treated as little endian.
		A leading byte order mark is neither interpreted nor removed.
*/
string convertToUtf8(immutable(ubyte)[] data, string dataCharacterEncoding) {
	// just to normalize the passed string...
	auto encoding = dataCharacterEncoding.toLower();
	encoding = encoding.replace(" ", "");
	encoding = encoding.replace("-", "");
	encoding = encoding.replace("_", "");
	// should be good enough.

	switch(encoding) {
		default:
			throw new Exception("I don't know how to convert " ~ dataCharacterEncoding ~ " to UTF-8");
		// The UTF-16/32 code units are assembled byte by byte rather than
		// by casting the array: a cast raises a non-Exception runtime error
		// on a misaligned length and silently depends on host endianness.
		case "utf16":
		case "utf16le":
			return decodeFixedWidthUnits!wchar(data, false, "UTF-16");
		case "utf16be":
			return decodeFixedWidthUnits!wchar(data, true, "UTF-16");
		case "utf32":
		case "utf32le":
			return decodeFixedWidthUnits!dchar(data, false, "UTF-32");
		case "utf32be":
			return decodeFixedWidthUnits!dchar(data, true, "UTF-32");
		case "ascii":
		case "usascii": // utf-8 is a superset of ascii
		case "utf8":
			// since the input is immutable, reinterpreting it is ok.
			return cast(string) data;
		// and now the various 8 bit encodings we support.
		case "windows1252":
			return decodeImpl(data, ISO_8859_1, Windows_1252);
		case "windows1251":
			return decodeImpl(data, Windows_1251, Windows_1251_Lower);
		case "koi8r":
			return decodeImpl(data, KOI8_R, KOI8_R_Lower);
		case "latin1":
		case "iso88591":
			// Why am I putting Windows_1252 here? A lot of
			// stuff in the wild is mislabeled, so this will
			// do some good in the Just Works department.
			// Regardless, I don't handle the
			// control char set in that zone anyway right now.
			return decodeImpl(data, ISO_8859_1, Windows_1252);
		case "iso88592":
			return decodeImpl(data, ISO_8859_2);
		case "iso88593":
			return decodeImpl(data, ISO_8859_3);
		case "iso88594":
			return decodeImpl(data, ISO_8859_4);
		case "iso88595":
			return decodeImpl(data, ISO_8859_5);
		case "iso88596":
			return decodeImpl(data, ISO_8859_6);
		case "iso88597":
			return decodeImpl(data, ISO_8859_7);
		case "iso88598":
			return decodeImpl(data, ISO_8859_8);
		case "iso88599":
			return decodeImpl(data, ISO_8859_9);
		case "iso885910":
			return decodeImpl(data, ISO_8859_10);
		case "iso885911":
			return decodeImpl(data, ISO_8859_11);
		case "iso885913":
			return decodeImpl(data, ISO_8859_13);
		case "iso885914":
			return decodeImpl(data, ISO_8859_14);
		case "iso885915":
			return decodeImpl(data, ISO_8859_15);
		case "iso885916":
			return decodeImpl(data, ISO_8859_16);
	}

	assert(0);
}

// Assembles fixed-width code units (wchar for UTF-16, dchar for UTF-32) from
// raw bytes in the requested byte order and transcodes them to UTF-8.
// Throws Exception when the byte count is not a multiple of the unit size.
private string decodeFixedWidthUnits(Char)(immutable(ubyte)[] data, bool bigEndian, string label) {
	enum size_t width = Char.sizeof;

	if(data.length % width != 0)
		throw new Exception(label ~ " data length must be a multiple of " ~ to!string(width) ~ " bytes");

	auto units = new Char[](data.length / width);
	foreach(index, ref unit; units) {
		immutable offset = index * width;
		uint value = 0;
		foreach(position; 0 .. width) {
			// which byte of the code unit this input byte represents
			immutable significance = bigEndian ? width - 1 - position : position;
			value |= (cast(uint) data[offset + position]) << cast(uint)(8 * significance);
		}
		unit = cast(Char) value;
	}

	return to!string(units);
}
134 
/// Tries to determine the current encoding based on the content.
/// Only really helps with the UTF variants.
///
/// Returns "UTF-8" if the data validates as UTF-8 (this includes empty
/// input). Otherwise it looks for a byte order mark and returns
/// "UTF-32 LE", "UTF-32 BE", "UTF-16 LE" or "UTF-16 BE".
/// Returns null if it can't be reasonably sure.
string tryToDetermineEncoding(in ubyte[] rawdata) {
	import std.utf;
	try {
		validate!string(cast(string) rawdata);
		// the odds of non-UTF-8 data validating as utf-8 are pretty low
		return "UTF-8";
	} catch(UTFException t) {
		// it's definitely not UTF-8! Fall through to the BOM checks.
	}

	// not checking for utf8 bom; if it was that, we wouldn't be here.

	// The 4-byte UTF-32 marks must be tested before the 2-byte UTF-16
	// ones: the UTF-32 LE mark (FF FE 00 00) starts with the UTF-16 LE
	// mark (FF FE), so testing UTF-16 first would hide it.
	if(rawdata.length >= 4) {
		if(rawdata[0] == 0xff && rawdata[1] == 0xfe
		&& rawdata[2] == 0x00 && rawdata[3] == 0x00)
			return "UTF-32 LE";
		if(rawdata[0] == 0x00 && rawdata[1] == 0x00
		&& rawdata[2] == 0xfe && rawdata[3] == 0xff)
			return "UTF-32 BE";
	}

	if(rawdata.length >= 2) {
		if(rawdata[0] == 0xff && rawdata[1] == 0xfe)
			return "UTF-16 LE";
		if(rawdata[0] == 0xfe && rawdata[1] == 0xff)
			return "UTF-16 BE";
	}

	// we don't know with enough confidence. The app will have to find another way.
	return null;
}
171 
// Table-driven decoder for single-byte encodings; the translation tables
// below supply the arguments.
//
// data           - the raw bytes to decode
// chars160to255  - required, 96 entries: characters for bytes 160..255
// chars128to159  - optional, 32 entries: characters for bytes 128..159;
//                  when null each such byte becomes a space
// chars0to127    - optional, 128 entries: characters for bytes 0..127;
//                  when null those bytes are passed through as ASCII
//
// Returns the decoded text as UTF-8 (checked by the out contract).
string decodeImpl(in ubyte[] data, in dchar[] chars160to255, in dchar[] chars128to159 = null, in dchar[] chars0to127 = null)
	in {
		assert(chars160to255.length == 256 - 160);
		assert(chars128to159 is null || chars128to159.length == 160 - 128);
		assert(chars0to127 is null || chars0to127.length == 128 - 0);
	}
	out(ret) {
		import std.utf;
		validate(ret);
	}
body {
	string result;

	// Pick the character for each byte first, then append it once;
	// appending a dchar to a string encodes it as UTF-8.
	foreach(b; data) {
		dchar decoded;

		if(b >= 160)
			decoded = chars160to255[b - 160];
		else if(b >= 128)
			decoded = (chars128to159 is null) ? ' ' : chars128to159[b - 128];
		else
			decoded = (chars0to127 is null) ? cast(dchar) b : chars0to127[b]; // ascii is the same

		result ~= decoded;
	}

	return result;
}
207 
208 
209 // Here come the translation tables.
210 
// Windows-1252 characters for bytes 128 (0x80) through 159 (0x9F).
// Bytes below 128 are the same as ascii, and bytes above 159 are the same
// as iso 8859 1, seen below. Positions with no assigned character hold ' '.
immutable dchar[] Windows_1252 = [
	'€', ' ', '‚', 'ƒ', '„', '…', '†', '‡', // 0x80 - 0x87
	'ˆ', '‰', 'Š', '‹', 'Œ', ' ', 'Ž', ' ', // 0x88 - 0x8F
	' ', '‘', '’', '“', '”', '•', '–', '—', // 0x90 - 0x97
	'˜', '™', 'š', '›', 'œ', ' ', 'ž', 'Ÿ', // 0x98 - 0x9F
];
219 
220 // the following tables give the characters from decimal 160 up to 255
221 // in the given encodings.
222 
// ISO 8859-1 (Latin-1) characters for bytes 160 (0xA0) through 255 (0xFF).
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible and easily lost when the file is copied.
immutable dchar[] ISO_8859_1 = [
	'\u00A0', '¡', '¢', '£', '¤', '¥', '¦', '§',
	'¨', '©', 'ª', '«', '¬', '\u00AD', '®', '¯',
	'°', '±', '²', '³', '´', 'µ', '¶', '·',
	'¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
	'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×',
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
	'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷',
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ'];
236 
// ISO 8859-2 (Latin-2) characters for bytes 160 (0xA0) through 255 (0xFF).
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible and easily lost when the file is copied.
immutable dchar[] ISO_8859_2 = [
	'\u00A0', 'Ą', '˘', 'Ł', '¤', 'Ľ', 'Ś', '§',
	'¨', 'Š', 'Ş', 'Ť', 'Ź', '\u00AD', 'Ž', 'Ż',
	'°', 'ą', '˛', 'ł', '´', 'ľ', 'ś', 'ˇ',
	'¸', 'š', 'ş', 'ť', 'ź', '˝', 'ž', 'ż',
	'Ŕ', 'Á', 'Â', 'Ă', 'Ä', 'Ĺ', 'Ć', 'Ç',
	'Č', 'É', 'Ę', 'Ë', 'Ě', 'Í', 'Î', 'Ď',
	'Đ', 'Ń', 'Ň', 'Ó', 'Ô', 'Ő', 'Ö', '×',
	'Ř', 'Ů', 'Ú', 'Ű', 'Ü', 'Ý', 'Ţ', 'ß',
	'ŕ', 'á', 'â', 'ă', 'ä', 'ĺ', 'ć', 'ç',
	'č', 'é', 'ę', 'ë', 'ě', 'í', 'î', 'ď',
	'đ', 'ń', 'ň', 'ó', 'ô', 'ő', 'ö', '÷',
	'ř', 'ů', 'ú', 'ű', 'ü', 'ý', 'ţ', '˙'];
250 
// ISO 8859-3 (Latin-3) characters for bytes 160 (0xA0) through 255 (0xFF).
// Positions with no assigned character hold ' '. The no-break space (0xA0)
// and soft hyphen (0xAD) are written as escapes because they are invisible
// and easily lost when the file is copied.
immutable dchar[] ISO_8859_3 = [
	'\u00A0', 'Ħ', '˘', '£', '¤', ' ', 'Ĥ', '§',
	'¨', 'İ', 'Ş', 'Ğ', 'Ĵ', '\u00AD', ' ', 'Ż',
	'°', 'ħ', '²', '³', '´', 'µ', 'ĥ', '·',
	'¸', 'ı', 'ş', 'ğ', 'ĵ', '½', ' ', 'ż',
	'À', 'Á', 'Â', ' ', 'Ä', 'Ċ', 'Ĉ', 'Ç',
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
	' ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ġ', 'Ö', '×',
	'Ĝ', 'Ù', 'Ú', 'Û', 'Ü', 'Ŭ', 'Ŝ', 'ß',
	'à', 'á', 'â', ' ', 'ä', 'ċ', 'ĉ', 'ç',
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
	' ', 'ñ', 'ò', 'ó', 'ô', 'ġ', 'ö', '÷',
	'ĝ', 'ù', 'ú', 'û', 'ü', 'ŭ', 'ŝ', '˙'];
264 
// ISO 8859-4 (Latin-4) characters for bytes 160 (0xA0) through 255 (0xFF).
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible and easily lost when the file is copied.
immutable dchar[] ISO_8859_4 = [
	'\u00A0', 'Ą', 'ĸ', 'Ŗ', '¤', 'Ĩ', 'Ļ', '§',
	'¨', 'Š', 'Ē', 'Ģ', 'Ŧ', '\u00AD', 'Ž', '¯',
	'°', 'ą', '˛', 'ŗ', '´', 'ĩ', 'ļ', 'ˇ',
	'¸', 'š', 'ē', 'ģ', 'ŧ', 'Ŋ', 'ž', 'ŋ',
	'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į',
	'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ī',
	'Đ', 'Ņ', 'Ō', 'Ķ', 'Ô', 'Õ', 'Ö', '×',
	'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ũ', 'Ū', 'ß',
	'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į',
	'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ī',
	'đ', 'ņ', 'ō', 'ķ', 'ô', 'õ', 'ö', '÷',
	'ø', 'ų', 'ú', 'û', 'ü', 'ũ', 'ū', '˙'];
278 
// ISO 8859-5 (Cyrillic) characters for bytes 160 (0xA0) through 255 (0xFF).
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible and easily lost when the file is copied.
immutable dchar[] ISO_8859_5 = [
	'\u00A0', 'Ё', 'Ђ', 'Ѓ', 'Є', 'Ѕ', 'І', 'Ї',
	'Ј', 'Љ', 'Њ', 'Ћ', 'Ќ', '\u00AD', 'Ў', 'Џ',
	'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З',
	'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П',
	'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч',
	'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я',
	'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з',
	'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
	'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч',
	'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я',
	'№', 'ё', 'ђ', 'ѓ', 'є', 'ѕ', 'і', 'ї',
	'ј', 'љ', 'њ', 'ћ', 'ќ', '§', 'ў', 'џ'];
292 
// ISO 8859-6 (Arabic) characters for bytes 160 (0xA0) through 255 (0xFF).
// Positions with no assigned character hold ' '. The no-break space (0xA0),
// the soft hyphen (0xAD) and the combining marks (0xEB - 0xF2) are written
// as escapes because they are invisible or attach to neighbouring
// characters, which makes the literal forms fragile.
immutable dchar[] ISO_8859_6 = [
	'\u00A0', ' ', ' ', ' ', '¤', ' ', ' ', ' ',
	' ', ' ', ' ', ' ', '،', '\u00AD', ' ', ' ',
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
	' ', ' ', ' ', '؛', ' ', ' ', ' ', '؟',
	' ', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا',
	'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د',
	'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط',
	'ظ', 'ع', 'غ', ' ', ' ', ' ', ' ', ' ',
	'ـ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه',
	'و', 'ى', 'ي', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F',
	'\u0650', '\u0651', '\u0652', ' ', ' ', ' ', ' ', ' ',
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '];
306 
// ISO 8859-7 (Greek) characters for bytes 160 (0xA0) through 255 (0xFF).
// Positions with no assigned character hold ' '. The no-break space (0xA0)
// and soft hyphen (0xAD) are written as escapes because they are invisible
// and easily lost when the file is copied.
immutable dchar[] ISO_8859_7 = [
	'\u00A0', '‘', '’', '£', '€', '₯', '¦', '§',
	'¨', '©', 'ͺ', '«', '¬', '\u00AD', ' ', '―',
	'°', '±', '²', '³', '΄', '΅', 'Ά', '·',
	'Έ', 'Ή', 'Ί', '»', 'Ό', '½', 'Ύ', 'Ώ',
	'ΐ', 'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η',
	'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο',
	'Π', 'Ρ', ' ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ',
	'Ψ', 'Ω', 'Ϊ', 'Ϋ', 'ά', 'έ', 'ή', 'ί',
	'ΰ', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η',
	'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο',
	'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ',
	'ψ', 'ω', 'ϊ', 'ϋ', 'ό', 'ύ', 'ώ', ' '];
320 
// ISO 8859-8 (Hebrew) characters for bytes 160 (0xA0) through 255 (0xFF).
// Positions with no assigned character hold ' '. The no-break space (0xA0)
// and soft hyphen (0xAD) are written as escapes because they are invisible
// and easily lost when the file is copied.
// Bytes 0xFD and 0xFE are the LEFT-TO-RIGHT MARK (U+200E) and the
// RIGHT-TO-LEFT MARK (U+200F); they are escaped for the same reason, and
// so that they are not mistaken for spaces.
immutable dchar[] ISO_8859_8 = [
	'\u00A0', ' ', '¢', '£', '¤', '¥', '¦', '§',
	'¨', '©', '×', '«', '¬', '\u00AD', '®', '¯',
	'°', '±', '²', '³', '´', 'µ', '¶', '·',
	'¸', '¹', '÷', '»', '¼', '½', '¾', ' ',
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
	' ', ' ', ' ', ' ', ' ', ' ', ' ', '‗',
	'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח',
	'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן',
	'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ', 'ק',
	'ר', 'ש', 'ת', ' ', ' ', '\u200E', '\u200F', ' '];
335 
// ISO 8859-9 (Latin-5, Turkish) characters for bytes 160 (0xA0) through
// 255 (0xFF). The no-break space (0xA0) and soft hyphen (0xAD) are written
// as escapes because they are invisible and easily lost when the file is
// copied.
immutable dchar[] ISO_8859_9 = [
	'\u00A0', '¡', '¢', '£', '¤', '¥', '¦', '§',
	'¨', '©', 'ª', '«', '¬', '\u00AD', '®', '¯',
	'°', '±', '²', '³', '´', 'µ', '¶', '·',
	'¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
	'Ğ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×',
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'İ', 'Ş', 'ß',
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
	'ğ', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷',
	'ø', 'ù', 'ú', 'û', 'ü', 'ı', 'ş', 'ÿ'];
349 
// ISO 8859-10 (Latin-6, Nordic) characters for bytes 160 (0xA0) through
// 255 (0xFF). The no-break space (0xA0) and soft hyphen (0xAD) are written
// as escapes because they are invisible and easily lost when the file is
// copied.
immutable dchar[] ISO_8859_10 = [
	'\u00A0', 'Ą', 'Ē', 'Ģ', 'Ī', 'Ĩ', 'Ķ', '§',
	'Ļ', 'Đ', 'Š', 'Ŧ', 'Ž', '\u00AD', 'Ū', 'Ŋ',
	'°', 'ą', 'ē', 'ģ', 'ī', 'ĩ', 'ķ', '·',
	'ļ', 'đ', 'š', 'ŧ', 'ž', '―', 'ū', 'ŋ',
	'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į',
	'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ï',
	'Ð', 'Ņ', 'Ō', 'Ó', 'Ô', 'Õ', 'Ö', 'Ũ',
	'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
	'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į',
	'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ï',
	'ð', 'ņ', 'ō', 'ó', 'ô', 'õ', 'ö', 'ũ',
	'ø', 'ų', 'ú', 'û', 'ü', 'ý', 'þ', 'ĸ'];
363 
// ISO 8859-11 (Thai) characters for bytes 160 (0xA0) through 255 (0xFF).
// Positions with no assigned character hold ' '. The no-break space (0xA0)
// and the combining vowel/tone marks are written as escapes because they
// are invisible or attach to neighbouring characters, which makes the
// literal forms fragile.
immutable dchar[] ISO_8859_11 = [
	'\u00A0', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง',
	'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ',
	'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
	'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ',
	'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว',
	'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ',
	'ะ', '\u0E31', 'า', 'ำ', '\u0E34', '\u0E35', '\u0E36', '\u0E37',
	'\u0E38', '\u0E39', '\u0E3A', ' ', ' ', ' ', ' ', '฿',
	'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ', '\u0E47',
	'\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '๏',
	'๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗',
	'๘', '๙', '๚', '๛', ' ', ' ', ' ', ' '];
377 
// ISO 8859-13 (Latin-7, Baltic) characters for bytes 160 (0xA0) through
// 255 (0xFF). Bytes 0xD5 / 0xF5 are O with tilde (Õ / õ), not O with
// double acute. The no-break space (0xA0) and soft hyphen (0xAD) are
// written as escapes because they are invisible and easily lost when the
// file is copied.
immutable dchar[] ISO_8859_13 = [
	'\u00A0', '”', '¢', '£', '¤', '„', '¦', '§',
	'Ø', '©', 'Ŗ', '«', '¬', '\u00AD', '®', 'Æ',
	'°', '±', '²', '³', '“', 'µ', '¶', '·',
	'ø', '¹', 'ŗ', '»', '¼', '½', '¾', 'æ',
	'Ą', 'Į', 'Ā', 'Ć', 'Ä', 'Å', 'Ę', 'Ē',
	'Č', 'É', 'Ź', 'Ė', 'Ģ', 'Ķ', 'Ī', 'Ļ',
	'Š', 'Ń', 'Ņ', 'Ó', 'Ō', '\u00D5', 'Ö', '×',
	'Ų', 'Ł', 'Ś', 'Ū', 'Ü', 'Ż', 'Ž', 'ß',
	'ą', 'į', 'ā', 'ć', 'ä', 'å', 'ę', 'ē',
	'č', 'é', 'ź', 'ė', 'ģ', 'ķ', 'ī', 'ļ',
	'š', 'ń', 'ņ', 'ó', 'ō', '\u00F5', 'ö', '÷',
	'ų', 'ł', 'ś', 'ū', 'ü', 'ż', 'ž', '’'];
391 
// ISO 8859-14 (Latin-8, Celtic) characters for bytes 160 (0xA0) through
// 255 (0xFF). Bytes 0xD5 / 0xF5 are O with tilde (Õ / õ), not O with
// double acute. The no-break space (0xA0) and soft hyphen (0xAD) are
// written as escapes because they are invisible and easily lost when the
// file is copied.
immutable dchar[] ISO_8859_14 = [
	'\u00A0', 'Ḃ', 'ḃ', '£', 'Ċ', 'ċ', 'Ḋ', '§',
	'Ẁ', '©', 'Ẃ', 'ḋ', 'Ỳ', '\u00AD', '®', 'Ÿ',
	'Ḟ', 'ḟ', 'Ġ', 'ġ', 'Ṁ', 'ṁ', '¶', 'Ṗ',
	'ẁ', 'ṗ', 'ẃ', 'Ṡ', 'ỳ', 'Ẅ', 'ẅ', 'ṡ',
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
	'Ŵ', 'Ñ', 'Ò', 'Ó', 'Ô', '\u00D5', 'Ö', 'Ṫ',
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Ŷ', 'ß',
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
	'ŵ', 'ñ', 'ò', 'ó', 'ô', '\u00F5', 'ö', 'ṫ',
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ŷ', 'ÿ'];
405 
// ISO 8859-15 (Latin-9) characters for bytes 160 (0xA0) through 255 (0xFF).
// It differs from ISO 8859-1 only at 0xA4, 0xA6, 0xA8, 0xB4, 0xB8, 0xBC,
// 0xBD and 0xBE; in particular 0xD5 / 0xF5 are O with tilde (Õ / õ).
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible and easily lost when the file is copied.
immutable dchar[] ISO_8859_15 = [
	'\u00A0', '¡', '¢', '£', '€', '¥', 'Š', '§',
	'š', '©', 'ª', '«', '¬', '\u00AD', '®', '¯',
	'°', '±', '²', '³', 'Ž', 'µ', '¶', '·',
	'ž', '¹', 'º', '»', 'Œ', 'œ', 'Ÿ', '¿',
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
	'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', '\u00D5', 'Ö', '×',
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
	'ð', 'ñ', 'ò', 'ó', 'ô', '\u00F5', 'ö', '÷',
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ'];
419 
// ISO 8859-16 (Latin-10, South-Eastern European) characters for bytes
// 160 (0xA0) through 255 (0xFF). Byte 0xD0 is D WITH STROKE (U+0110), the
// uppercase partner of U+0111 at 0xF0; it is escaped because it is easily
// confused with the look-alike ETH (U+00D0). The no-break space (0xA0) and
// soft hyphen (0xAD) are written as escapes because they are invisible and
// easily lost when the file is copied.
immutable dchar[] ISO_8859_16 = [
	'\u00A0', 'Ą', 'ą', 'Ł', '€', '„', 'Š', '§',
	'š', '©', 'Ș', '«', 'Ź', '\u00AD', 'ź', 'Ż',
	'°', '±', 'Č', 'ł', 'Ž', '”', '¶', '·',
	'ž', 'č', 'ș', '»', 'Œ', 'œ', 'Ÿ', 'ż',
	'À', 'Á', 'Â', 'Ă', 'Ä', 'Ć', 'Æ', 'Ç',
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
	'\u0110', 'Ń', 'Ò', 'Ó', 'Ô', 'Ő', 'Ö', 'Ś',
	'Ű', 'Ù', 'Ú', 'Û', 'Ü', 'Ę', 'Ț', 'ß',
	'à', 'á', 'â', 'ă', 'ä', 'ć', 'æ', 'ç',
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
	'\u0111', 'ń', 'ò', 'ó', 'ô', 'ő', 'ö', 'ś',
	'ű', 'ù', 'ú', 'û', 'ü', 'ę', 'ț', 'ÿ'];
433 
// KOI8-R characters for bytes 128 (0x80) through 159 (0x9F):
// box drawing, block elements and a few symbols.
immutable dchar[] KOI8_R_Lower = [
	'─', '│', '┌', '┐', '└', '┘', '├', '┤', // 0x80 - 0x87
	'┬', '┴', '┼', '▀', '▄', '█', '▌', '▐', // 0x88 - 0x8F
	'░', '▒', '▓', '⌠', '■', '∙', '√', '≈', // 0x90 - 0x97
	'≤', '≥', '\u00a0', '⌡', '°', '²', '·', '÷', // 0x98 - 0x9F
];
439 
// KOI8-R characters for bytes 160 (0xA0) through 255 (0xFF).
// 0xA3 is lowercase ё and 0xB3 is uppercase Ё. Bytes 0xC0 - 0xDF are the
// lowercase Cyrillic letters and 0xE0 - 0xFF are the matching uppercase
// letters in the same order.
immutable dchar[] KOI8_R = [
	'═', '║', '╒', 'ё', '╓', '╔', '╕', '╖',
	'╗', '╘', '╙', '╚', '╛', '╜', '╝', '╞',
	'╟', '╠', '╡', 'Ё', '╢', '╣', '╤', '╥',
	'╦', '╧', '╨', '╩', '╪', '╫', '╬', '©',
	'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г',
	'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о',
	'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в',
	'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ',
	'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г',
	'Х', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О',
	'П', 'Я', 'Р', 'С', 'Т', 'У', 'Ж', 'В',
	'Ь', 'Ы', 'З', 'Ш', 'Э', 'Щ', 'Ч', 'Ъ'];
453 
// Windows-1251 characters for bytes 128 (0x80) through 159 (0x9F).
// The position with no assigned character (0x98) holds ' '.
immutable dchar[] Windows_1251_Lower = [
	'Ђ', 'Ѓ', '‚', 'ѓ', '„', '…', '†', '‡', // 0x80 - 0x87
	'€', '‰', 'Љ', '‹', 'Њ', 'Ќ', 'Ћ', 'Џ', // 0x88 - 0x8F
	'ђ', '‘', '’', '“', '”', '•', '–', '—', // 0x90 - 0x97
	' ', '™', 'љ', '›', 'њ', 'ќ', 'ћ', 'џ', // 0x98 - 0x9F
];
459 
// Windows-1251 characters for bytes 160 (0xA0) through 255 (0xFF).
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible and easily lost when the file is copied.
immutable dchar[] Windows_1251 = [
	'\u00A0', 'Ў', 'ў', 'Ј', '¤', 'Ґ', '¦', '§',
	'Ё', '©', 'Є', '«', '¬', '\u00AD', '®', 'Ї',
	'°', '±', 'І', 'і', 'ґ', 'µ', '¶', '·',
	'ё', '№', 'є', '»', 'ј', 'Ѕ', 'ѕ', 'ї',
	'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З',
	'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П',
	'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч',
	'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я',
	'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з',
	'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
	'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч',
	'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'];
473 
474