forked from Lainports/freebsd-ports
analysis and manipulation of parts of an HTML document, including some common server-side tags, while reproducing verbatim any unrecognised or invalid HTML. It also provides high-level HTML form manipulation functions. WWW: http://jerichohtml.sourceforge.net/doc/index.html PR: ports/124770 Submitted by: Marcin Cieslak <saper at SYSTEM.PL>
15 lines
1.9 KiB
Text
15 lines
1.9 KiB
Text
--- src/java/au/id/jericho/lib/html/StreamEncodingDetector.java.orig 2008-06-17 21:01:53.890292905 +0200
|
||
+++ src/java/au/id/jericho/lib/html/StreamEncodingDetector.java 2008-06-17 21:02:43.940300330 +0200
|
||
@@ -203,9 +203,9 @@
|
||
// Assume the more likely case of four 8-bit characters <= U+00FF.
|
||
// Check whether it fits some common EBCDIC strings that might be found at the start of a document:
|
||
if (b1==0x4C) { // first character is EBCDIC '<' (ASCII 'L'), check a couple more characters before assuming EBCDIC encoding:
|
||
- if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)"); // first four bytes are "<?xm" in EBCDIC ("Lo§”" in Windows-1252)
|
||
- if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)"); // first four bytes are "<!DO" in EBCDIC ("LZÄÖ" in Windows-1252)
|
||
- if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)"); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" ("LÈãÔ" in Windows-1252), or "<htm" ("Lˆ£”" in Windows-1252)
|
||
+ if (b2==0x6F && b3==0xA7 && b4==0x94) return setEncoding(EBCDIC,"default EBCDIC encoding (<?xml...> detected)"); // first four bytes are "<?xm" in EBCDIC
|
||
+ if (b2==0x5A && b3==0xC4 && b4==0xD6) return setEncoding(EBCDIC,"default EBCDIC encoding (<!DOCTYPE...> detected)"); // first four bytes are "<!DO" in EBCDIC
|
||
+ if ((b2&b3&b4&0x80)!=0) return setEncoding(EBCDIC,"default EBCDIC-compatible encoding (HTML element detected)"); // all of the 3 bytes after the '<' have the high-order bit set, indicating EBCDIC letters such as "<HTM" or "<htm"
|
||
// although this is not an exhaustive check for EBCDIC, it is safer to assume a more common preliminary encoding if none of these conditions are met.
|
||
}
|
||
// Now confident that it is not EBCDIC, but some other 8-bit encoding.
|