#------------------------------------------------------------------------------
#$Author: andrius $
#$Date: 2020-04-17 15:02:20 -0400 (Fri, 17 Apr 2020) $ 
#$Revision: 6668 $
#$URL: svn://saulius-grazulis.lt/restful/tags/v0.15.2/lib/RestfulDB/Charman.pm $
#------------------------------------------------------------------------------
#*
#  Manage Unicode character conversions
#**

package RestfulDB::Charman;
require Exporter;
our @ISA = qw( Exporter );
@EXPORT_OK = qw( remove_diacritics decode_hex_string );

use Unicode::Normalize qw( NFD );
use Encode qw( decode );

use strict;
use warnings;

# Remove all diacritic symbols from Unicode characters; uses a
# (simplified) method from:
# http://ahinea.com/en/tech/accented-translate.html
# (2011-12-10; last viewed 2017-12-12).

sub remove_diacritics
{
    my ($string) = @_;

    my $string_without_diacritics = $string;

    $string_without_diacritics = NFD( $string );

    for (($string_without_diacritics)) {
        # remove all combining characters:

        s/\pM//g;

        # additional normalizations:

        s/\x{00df}/ss/g;  ##  German beta “ß” -> “ss”
        s/\x{00c6}/AE/g;  ##  Æ
        s/\x{00e6}/ae/g;  ##  æ
        s/\x{0132}/IJ/g;  ##  Ĳ
        s/\x{0133}/ij/g;  ##  ĳ
        s/\x{0152}/Oe/g;  ##  Œ
        s/\x{0153}/oe/g;  ##  œ

        tr/\x{00d0}\x{0110}\x{00f0}\x{0111}\x{0126}\x{0127}/DDddHh/; # ÐĐðđĦħ
        tr/\x{0131}\x{0138}\x{013f}\x{0141}\x{0140}\x{0142}/ikLLll/; # ıĸĿŁŀł
        tr/\x{014a}\x{0149}\x{014b}\x{00d8}\x{00f8}\x{017f}/NnnOos/; # ŊŉŋØøſ
        tr/\x{00de}\x{0166}\x{00fe}\x{0167}/TTtt/;                   # ÞŦþŧ
    }

    return $string_without_diacritics;
}

# Decodes hex-encoded Unicode strings, as those received from openssl
# -subject generated without UTF8 support:

# e.g.: "Gra\xC5\xBEulis" -> "Gražulis"

# CAVEAT: only two byte UTF-8 sequences are supported at the moment!

sub decode_hex_string
{
    my ($string) = @_;

    # The code below implements UTF-8 byte interpretation from the
    # HEX digits obtained as a text:

    # Convert all encoded low-bytes to characters:
    $string =~
        s/(?:\\x(..))/
          eval("0x".$1) < 0x80 ?
              chr( eval("0x".$1) ) : "\\x" . $1 /egx;

    # Convert all remaining double-byte sequences:
    $string =~
        s/(?:\\x(..))(?:\\x(..))/
              chr( (( eval("0x".$1) & 0x1F) << 6) + 
                    ( eval("0x".$2) & 0x3F) ) /egx;
    
    return $string;
}

1;
