class HTML::Encoder

Public Class Methods

new() click to toggle source
# File lib/HTML/Encoder.rb, line 7
# Builds the two lookup tables held by the encoder.
#
# @entity2char maps an entity name (without the surrounding "&" and ";") to
# the character it represents. @char2entity is the reverse table, mapping a
# character to its complete "&name;" entity string; every byte value 0..255
# that has no named entity gets a decimal numeric reference ("&#N;") instead.
def initialize()

    @entity2char = {
        'amp'   => '&',  # ampersand
        'gt'    => '>',  # greater than
        'lt'    => '<',  # less than
        'quot'  => '"',  # double quote
        'apos'  => "'",  # single quote

        # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
        'AElig'  => 198.chr,  # capital AE diphthong (ligature)
        'Aacute' => 193.chr,  # capital A, acute accent
        'Acirc'  => 194.chr,  # capital A, circumflex accent
        'Agrave' => 192.chr,  # capital A, grave accent
        'Aring'  => 197.chr,  # capital A, ring
        'Atilde' => 195.chr,  # capital A, tilde
        'Auml'   => 196.chr,  # capital A, dieresis or umlaut mark
        'Ccedil' => 199.chr,  # capital C, cedilla
        'ETH'    => 208.chr,  # capital Eth, Icelandic
        'Eacute' => 201.chr,  # capital E, acute accent
        'Ecirc'  => 202.chr,  # capital E, circumflex accent
        'Egrave' => 200.chr,  # capital E, grave accent
        'Euml'   => 203.chr,  # capital E, dieresis or umlaut mark
        'Iacute' => 205.chr,  # capital I, acute accent
        'Icirc'  => 206.chr,  # capital I, circumflex accent
        'Igrave' => 204.chr,  # capital I, grave accent
        'Iuml'   => 207.chr,  # capital I, dieresis or umlaut mark
        'Ntilde' => 209.chr,  # capital N, tilde
        'Oacute' => 211.chr,  # capital O, acute accent
        'Ocirc'  => 212.chr,  # capital O, circumflex accent
        'Ograve' => 210.chr,  # capital O, grave accent
        'Oslash' => 216.chr,  # capital O, slash
        'Otilde' => 213.chr,  # capital O, tilde
        'Ouml'   => 214.chr,  # capital O, dieresis or umlaut mark
        'THORN'  => 222.chr,  # capital THORN, Icelandic
        'Uacute' => 218.chr,  # capital U, acute accent
        'Ucirc'  => 219.chr,  # capital U, circumflex accent
        'Ugrave' => 217.chr,  # capital U, grave accent
        'Uuml'   => 220.chr,  # capital U, dieresis or umlaut mark
        'Yacute' => 221.chr,  # capital Y, acute accent
        'aacute' => 225.chr,  # small a, acute accent
        'acirc'  => 226.chr,  # small a, circumflex accent
        'aelig'  => 230.chr,  # small ae diphthong (ligature)
        'agrave' => 224.chr,  # small a, grave accent
        'aring'  => 229.chr,  # small a, ring
        'atilde' => 227.chr,  # small a, tilde
        'auml'   => 228.chr,  # small a, dieresis or umlaut mark
        'ccedil' => 231.chr,  # small c, cedilla
        'eacute' => 233.chr,  # small e, acute accent
        'ecirc'  => 234.chr,  # small e, circumflex accent
        'egrave' => 232.chr,  # small e, grave accent
        'eth'    => 240.chr,  # small eth, Icelandic
        'euml'   => 235.chr,  # small e, dieresis or umlaut mark
        'iacute' => 237.chr,  # small i, acute accent
        'icirc'  => 238.chr,  # small i, circumflex accent
        'igrave' => 236.chr,  # small i, grave accent
        'iuml'   => 239.chr,  # small i, dieresis or umlaut mark
        'ntilde' => 241.chr,  # small n, tilde
        'oacute' => 243.chr,  # small o, acute accent
        'ocirc'  => 244.chr,  # small o, circumflex accent
        'ograve' => 242.chr,  # small o, grave accent
        'oslash' => 248.chr,  # small o, slash
        'otilde' => 245.chr,  # small o, tilde
        'ouml'   => 246.chr,  # small o, dieresis or umlaut mark
        'szlig'  => 223.chr,  # small sharp s, German (sz ligature)
        'thorn'  => 254.chr,  # small thorn, Icelandic
        'uacute' => 250.chr,  # small u, acute accent
        'ucirc'  => 251.chr,  # small u, circumflex accent
        'ugrave' => 249.chr,  # small u, grave accent
        'uuml'   => 252.chr,  # small u, dieresis or umlaut mark
        'yacute' => 253.chr,  # small y, acute accent
        'yuml'   => 255.chr,  # small y, dieresis or umlaut mark

        # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
        'copy'   => 169.chr,  # copyright sign
        'reg'    => 174.chr,  # registered sign
        'nbsp'   => 160.chr,  # non breaking space

        # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
        'iexcl'  => 161.chr,  # inverted exclamation mark
        'cent'   => 162.chr,  # cent sign
        'pound'  => 163.chr,  # pound sterling sign
        'curren' => 164.chr,  # general currency sign
        'yen'    => 165.chr,  # yen sign
        'brvbar' => 166.chr,  # broken vertical bar
        'sect'   => 167.chr,  # section sign
        'uml'    => 168.chr,  # umlaut (dieresis)
        'ordf'   => 170.chr,  # ordinal indicator, feminine
        'laquo'  => 171.chr,  # angle quotation mark, left
        'not'    => 172.chr,  # not sign
        'shy'    => 173.chr,  # soft hyphen
        'macr'   => 175.chr,  # macron
        'deg'    => 176.chr,  # degree sign
        'plusmn' => 177.chr,  # plus-or-minus sign
        'sup1'   => 185.chr,  # superscript one
        'sup2'   => 178.chr,  # superscript two
        'sup3'   => 179.chr,  # superscript three
        'acute'  => 180.chr,  # acute accent
        'micro'  => 181.chr,  # micro sign
        'para'   => 182.chr,  # pilcrow (paragraph sign)
        'middot' => 183.chr,  # middle dot
        'cedil'  => 184.chr,  # cedilla
        'ordm'   => 186.chr,  # ordinal indicator, masculine
        'raquo'  => 187.chr,  # angle quotation mark, right
        'frac14' => 188.chr,  # fraction one-quarter
        'frac12' => 189.chr,  # fraction one-half
        'frac34' => 190.chr,  # fraction three-quarters
        'iquest' => 191.chr,  # inverted question mark
        'times'  => 215.chr,  # multiply sign
        'divide' => 247.chr,  # divide sign
    }

    # Interpreters newer than 1.8.7 also take every pair supplied by
    # HTML::Encoder::Unicode.unicode_mapping; a pair whose name already exists
    # above replaces the built-in value.
    if RUBY_VERSION > '1.8.7'
        HTML::Encoder::Unicode.unicode_mapping.each do |name, char|
            @entity2char[name] = char
        end
    end

    # Invert the table. When several names share one character, the name
    # visited last is the one that ends up in @char2entity.
    @char2entity = {}
    @entity2char.each do |name, char|
        @char2entity[char] = "&#{name};"
    end

    # Give every remaining single-byte value a decimal numeric reference.
    (0..255).each do |code|
        byte = code.chr
        @char2entity[byte] = "&##{code};" unless @char2entity.key?( byte )
    end

end

Public Instance Methods

encode( string, *args ) click to toggle source
# File lib/HTML/Encoder.rb, line 135
# Replaces unsafe characters in +string+ with HTML entities.
#
# string  - the value to encode; it is converted with #to_s first.
# args[0] - optional list of characters to encode, given as a string (or
#           anything responding to #to_s). Each character in it is treated
#           literally. When it is missing, nil or empty, a built-in default
#           set is used instead.
#
# A character is replaced by its entry in @char2entity when one exists and
# by the result of num_entity otherwise.
#
# Returns a new String; the argument is not modified.
def encode( string, *args )

    unsafe_chars = args[0].to_s
    text = string.to_s

    if unsafe_chars.empty?
        # The default pattern leaves alone "\n", "\r", "\t", space, "!", "#",
        # "$", "%", "(" through ";", "=" and "?" through "~". Everything else
        # is encoded: the remaining control characters, '"', "&", "'", "<",
        # ">" and every character above "~".
        return text.gsub( /([^\n\r\t !\#\$%\(-;=?-~])/ ) { |c|
            @char2entity[c] || num_entity( c )
        }
    end

    # Resolve the replacement for each requested character once up front.
    lookup = {}
    unsafe_chars.each_char { |c|
        lookup[c] = @char2entity[c] || num_entity( c )
    }

    # The /m flag makes "." match newlines as well; without it a "\n" listed
    # in the unsafe set would never reach the block and stay unencoded.
    return text.gsub( /./m ) { |c| lookup.fetch( c, c ) }

end
encode_hex( *args ) click to toggle source
# File lib/HTML/Encoder.rb, line 154
# Encodes like #encode, but with @char2entity temporarily replaced by an
# empty table so that every encoded character is produced by num_entity
# instead of a named entity.
#
# args - passed to #encode unchanged.
#
# Returns whatever #encode returns. The original table is put back even when
# #encode raises, so a failed call cannot leave the instance with an empty
# lookup table.
def encode_hex( *args )
    saved_table = @char2entity
    @char2entity = {}
    begin
        return encode( *args )
    ensure
        @char2entity = saved_table
    end
end
num_entity( char ) click to toggle source
# File lib/HTML/Encoder.rb, line 162
# Builds the hexadecimal numeric character reference for +char+.
#
# char - a String; only its first character is used.
#
# Returns a String of the form "&#xHH;" with upper-case hex digits.
def num_entity( char )
    # String#ord honours the string's encoding, so a multi-byte character
    # yields its whole code point rather than just its first byte. The
    # first-byte path is kept for strings without #ord, for empty strings
    # and for strings whose bytes are not valid in their encoding, so those
    # inputs behave exactly as before.
    use_ord = char.respond_to?( :ord ) && !char.empty? && char.valid_encoding?
    code = use_ord ? char.ord : char.unpack( 'C' )[0]
    return sprintf( '&#x%X;', code )
end