#!/usr/bin/perl -w
######
# (In)valid UTF-8 GENerator :-)
# Generates multibyte sequences for 7-bit ASCII characters.
#
# source: http://www.devbox.be, http://pub.devbox.be
###
#
# $DevBox: iutf8gen.pl,v 1.2 2005/11/30 15:47:21 jimmy Exp $
#
# Copyright (C) 2005 Jimmy Scott . Belgium.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. The names of the authors may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
######

package main;

use strict;
use warnings;

# usage()
#
# Print the command-line synopsis to STDERR and terminate the process
# with exit status 1. Never returns.
sub usage
{
    print STDERR "usage: $0 string [num_bytes]\n";
    print STDERR " num_bytes defaults to 2\n";
    exit 1;
}

# iutf8($num, $str)
#
# Encode every 7-bit ASCII character of $str as an overlong ("invalid")
# UTF-8 sequence that is exactly $num bytes long.
#
#   $num - length of each generated sequence in bytes; must be 2 to 6
#   $str - input string; characters with an ordinal above 127 are skipped
#          and reported on STDERR
#
# Returns the concatenated sequences as a string of raw bytes (empty when
# $str is empty or undefined). Dies when $num is not an integer from 2 to 6.
sub iutf8
{
    my ($num, $str) = @_;

    # Without this check an out-of-range $num would silently produce
    # garbage instead of a well-formed multibyte layout.
    die "iutf8: num_bytes must be 2 to 6\n"
        unless defined $num && $num =~ /\A[2-6]\z/;

    return "" unless defined $str;

    # The 7 payload bits are split according to this table:
    #   110xxxxx 10xxxxxx
    #   1110xxxx 10xxxxxx 10xxxxxx
    #   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    #   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    #   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
    #
    # Everything in front of the payload is identical for every character,
    # so it is built once: the lead byte marker, zero-filled continuation
    # bytes, and the first six bits of the byte that carries the top two
    # payload bits.
    my $prefix;
    if ( $num == 2 ) {
        $prefix = "110000";
    } else {
        $prefix = ( "1" x $num ) . ( "0" x ( 8 - $num ) )   # lead byte
                . ( "10000000" x ( $num - 3 ) )             # empty continuations
                . "100000";
    }

    my $res = "";

    for my $offset ( 0 .. length($str) - 1 ) {
        my $byte = substr($str, $offset, 1);

        if ( ord($byte) > 127 ) {
            print STDERR "$0: skipping byte in offset $offset, " .
                "character is not 7-bit ASCII\n";
            next;
        }

        # Eight-character bit string of the byte, most significant bit first.
        my $bits = unpack "B*", $byte;
        my $f_bits = substr($bits, 0, 2);   # top two bits
        my $l_bits = substr($bits, 2, 6);   # low six bits

        $res .= $prefix . $f_bits . "10" . $l_bits;
    }

    return pack "B*", $res;
}

# main()
#
# Command-line entry point: validates @ARGV, encodes the given string with
# iutf8() and prints the result to STDOUT as a run of \xNN escapes.
# Exits with status 0 on success and 1 on a usage or argument error.
sub main
{
    # Parse arguments
    usage() if @ARGV != 1 && @ARGV != 2;

    my ($str, $num) = @ARGV;

    # Apply the default only when num_bytes was not given at all, so that an
    # explicit "0" is rejected below instead of being silently replaced.
    $num = 2 unless defined $num;

    if ( $num !~ /\A[2-6]\z/ ) {
        print STDERR "$0: num_bytes must be 2 to 6\n";
        exit 1;
    }

    my $bseq = iutf8($num, $str);

    # Render each generated byte as an upper-case \xNN escape.
    my $cseq = join "", map { sprintf "\\x%02X", ord($_) } split //, $bseq;

    print "Result: " . $cseq . "\n";

    exit 0;
}

# Run only when executed as a script; loading the file with require/do
# (for example from a test) must not parse @ARGV or exit.
main() unless caller;