Matej Tymes's Weblog: diacritique

Not long ago I visited hell. The hell was called JavaScript localization, and the headache started when I wanted to sort some strings. To do so I needed to compare them first. This would seem like a simple task if you were an English gentleman with no fear of diacritics. But as I live and work in a country where diacritics are ever present, I needed to handle them properly. The localeCompare method worked differently in every browser. Also, when the indexOf method was used to search through text containing diacritics with a search expression written without diacritics, it did not find the matches.

So once again I ended up doing something I don't like very much: I wrote my own custom JavaScript utility, which can be used like this:

localeHelper.diacritiqueComparison('čerešne', 'citron'); // = 1 ('č' is ordered after plain 'c')
localeHelper.diacritiqueComparison('čerešne', 'hrozno'); // = -1 ('č' is compared as 'c', which precedes 'h')

// the search is case insensitive
localeHelper.containsDiacritiqueText('štart', 'st');  // = true (plain 's' also matches 'š')
localeHelper.containsDiacritiqueText('štart', 'št');  // = true
localeHelper.containsDiacritiqueText('štart', 'tar'); // = true
localeHelper.containsDiacritiqueText('trend', 'tr');  // = true
localeHelper.containsDiacritiqueText('trend', 'ťr');  // = false (as we explicitly want the symbol 'ť' and not 't')
localeHelper.containsDiacritiqueText('trend', 'b');   // = false (symbol 'b' is not present)

The final code looked like this:

// Helper for comparing and searching texts written in alphabets derived from the Latin one.
// It translates letters with diacritique to their plain Latin letter by an explicit table,
// so the results do not depend on the browser's own localeCompare implementation.
function LocaleHelper() { // constructor

    // source: http://en.wikipedia.org/wiki/Alphabets_derived_from_the_Latin
    // supported languages :
    // Austro-Bavarian, Belarusian, Croatian, Czech, Dutch, Estonian, Finnish, French, German, Hungarian, Irish, Italian,
    // Latvian, Lithuanian, Polish, Portuguese, Romanian, Slovak, Sorbian, Spanish, Swedish, Turkish
    //
    // todo: not sure how to or if I should process german letter 'ß'
    //
    // Every entry is [plain Latin letter, all supported variants of that letter].
    // Keeping the variants next to their base letter avoids two parallel strings
    // that have to stay aligned position by position.
    var letterGroups = [
        // vowels
        ["A", "ÁÀÂÄĂĀÃÅĄÆ"], ["a", "áàâäăāãåąæ"],
        ["E", "ÉÈĖÊËĚĒĘ"],   ["e", "éèėêëěēę"],
        ["I", "ÍÌİÎÏĪĮ"],    ["i", "ıíìîïīį"],
        ["O", "ÓÒÔÖÕŐŒ"],    ["o", "óòôöõőœ"],
        ["U", "ÚÙÛÜŬŪŰŮŲ"],  ["u", "úùûüŭūűůų"],
        ["Y", "ÝŸ"],         ["y", "ýÿ"],
        // consonants
        ["C", "ĆČÇ"],        ["c", "ćčç"],
        ["D", "ĎĐ"],         ["d", "ďđ"],
        ["G", "ĞĢ"],         ["g", "ğģ"],
        ["K", "Ķ"],          ["k", "ķ"],
        ["L", "ĹĻŁĽ"],       ["l", "ĺļłľ"],
        ["N", "ŃŇÑŅ"],       ["n", "ńňñņ"],
        ["R", "ŔŘ"],         ["r", "ŕř"],
        ["S", "ŚŠŞȘ"],       ["s", "śšşș"],
        ["T", "ŤŢṬ"],        ["t", "ťţṭ"],
        ["Z", "ŹŻŽ"],        ["z", "źżž"]
    ];

    // A dictionary without a prototype: looking up keys such as 'length' or 'constructor'
    // yields undefined instead of a property inherited from Array/Object.
    this.charMap = Object.create(null);

    for (var groupIndex = 0; groupIndex < letterGroups.length; groupIndex++) {
        var latinLetter = letterGroups[groupIndex][0];
        var variants = letterGroups[groupIndex][1];

        for (var i = 0; i < variants.length; i++) {
            this.charMap[variants.charAt(i)] = latinLetter;
        }
    }
}


LocaleHelper.prototype = {

    // assigning a new prototype object drops the default 'constructor' reference - restore it
    constructor : LocaleHelper,

    // Returns the plain Latin letter for a supported letter with diacritique.
    // Any other value is returned unchanged.
    removeCharDiacritique : function(charToProcess) {

        var mapped = this.charMap[charToProcess];

        // every translation is a string - whatever else is found under the key is not a translation
        return (typeof mapped === 'string') ? mapped : charToProcess;
    },

    // Compares two characters after their diacritique was removed.
    // Returns 0 when they are equal, -1 when charA is ordered before charB and 1 otherwise.
    // note: charA.localeCompare(charB) is not used on purpose - it doesn't work the same on every browser
    localeCharCompare : function(charA, charB) {

        var newCharA = this.removeCharDiacritique(charA);
        var newCharB = this.removeCharDiacritique(charB);

        if (newCharA === newCharB) {
            return 0;
        }

        return (newCharA < newCharB) ? -1 : 1;
    },

    // Returns true only for the plain letters a-z and A-Z.
    isLatinLetter : function(character) {
        return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
    },


    // Comparator usable for sorting texts with diacritique. Returns -1, 0 or 1.
    //
    // The texts are compared character by character, ignoring case:
    //  1. the plain Latin letters decide first,
    //  2. when the plain letters are equal but the characters differ (e.g. 'c' and 'č'),
    //     the character codes decide - the plain letter therefore comes first,
    //  3. when one text is the beginning of the other, the shorter text comes first,
    //  4. case sensitivity is used only when the words have the same letters (they are read the same way).
    diacritiqueComparison : function(textA, textB) {
        // todo: note: in Lithuanian alphabet the 'y' character is just before 'j' - so this algorithm won't work properly

        var caseDiff = 0; // difference in case sensitiveness
        var minLength = Math.min(textA.length, textB.length);

        for (var i = 0; i < minLength; i++) {
            var charA = textA.charAt(i);
            var charB = textB.charAt(i);
            var lowerA = charA.toLocaleLowerCase();
            var lowerB = charB.toLocaleLowerCase();

            var result = this.localeCharCompare(lowerA, lowerB);
            if (result !== 0) {
                return result;
            }

            // same plain letter, different diacritique
            if (lowerA !== lowerB) {
                return (lowerA < lowerB) ? -1 : 1;
            }

            // first most left difference in case is the only important one
            if (caseDiff === 0 && charA !== charB) {
                caseDiff = (charA < charB) ? -1 : 1;
            }
        }

        if (textA.length !== textB.length) {
            return (textA.length < textB.length) ? -1 : 1;
        }

        // if the strings are identical let the case sensitive difference decide
        return caseDiff;
    },

    // Case insensitive search for searchText inside of fullText. Returns true or false.
    //
    // A plain Latin letter in searchText matches the same letter with any supported diacritique,
    // but a character with diacritique in searchText matches only exactly that character.
    // An empty searchText is always found.
    containsDiacritiqueText : function (fullText, searchText) {

        if (searchText.length === 0) {
            return true;
        }

        // last position where searchText still fits; negative when fullText is shorter
        var lastStart = fullText.length - searchText.length;

        for (var start = 0; start <= lastStart; start++) {
            var found = true;

            for (var offset = 0; offset < searchText.length; offset++) {
                var charA = fullText.charAt(start + offset).toLocaleLowerCase();
                var charB = searchText.charAt(offset).toLocaleLowerCase();

                if (charA === charB) {
                    continue;
                }

                // different characters match only when they share the plain letter
                // and the searched one was typed without diacritique
                if (this.localeCharCompare(charA, charB) !== 0 || !this.isLatinLetter(charB)) {
                    found = false;
                    break;
                }
            }

            if (found) {
                return true;
            }
        }

        return false;
    }
};


// Shared instance through which the helper methods are called.
var localeHelper = new LocaleHelper();

Dnes som od zákazníka dostal zadanie napísať v javascripte metódu na zotriedenie textov. Bola by to celkom jednoduchá úloha, ak by išlo o anglické texty. No bohužiaľ, naša slovenská (a takisto česká) abeceda má jednu nechutnú vec: diakritiku.

Ak sa teda snažíte zotriediť tieto texty:

"cudzí", "čučoriedka", "ťava", "tŕň", "trstina"

dostanete pri štandardnom porovnávaní textov túto postupnosť:

"cudzí", "trstina", "tŕň", "čučoriedka", "ťava"

A to nie je práve najlepšie usporiadanie (štandardne sú prvé veľké písmená bez diakritiky, nasledované malými písmenami bez diakritiky, potom veľké písmená s diakritikou a na záver malé písmená s diakritikou).

Klasické porovnanie teda nie je dostačujúce. Skúšal som teda nájsť niečo vhodnejšie a narazil som na celkom peknú metódu:

textA.localeCompare(textB)

S jej použitím som dosiahol o trošku lepší, nie však dostačujúci výsledok:

"čučoriedka", "cudzí", "ťava", "tŕň", "trstina"

Problém tejto metódy je že diakritiku úplne odignoruje a teda nikdy nedá znak s diakritikou za znak bez diakritiky, ale tieto znaky majú v porovnaní identické postavenie (čo spôsobilo, že znak 'č' sa ocitol pred znakom 'c', znak 'ť' pred znakom 't' a znak 'ŕ' pred znakom 'r').

K tomu sa ešte pridali problémy s kapitálkami. Skúšal som nájsť nejaké vhodné riešenie na internete bohužiaľ žiadne nebolo dostačujúce (plus nemal som chuť vymenovávať všetky znaky s diakritikou - som detailista a chcel som aby to fungovalo pre všetky jazyky odvodené z latinskej abecedy).

A tak som dospel k niečomu, čo normálne robia všetci nadšení programátori (a čo sa štandardne považuje za chybu): napíšem si túto metódu sám. A toto je to, čo nakoniec vzniklo:

   // Compares two texts character by character, ignoring case, using localeCompare.
   // The result is negative, zero or positive, so it can be used as a sort comparator.
   // Characters that localeCompare treats as equal are ordered by their character codes,
   // a shorter text that is the beginning of the longer one comes first,
   // and the case decides only when everything else is equal.
   function diacritiqueComparison(textA, textB) {
       var sharedLength = Math.min(textA.length, textB.length);
       var caseOrder = 0; // set by the leftmost position where only the case differs

       for (var pos = 0; pos < sharedLength; pos++) {
           var rawA = textA.charAt(pos);
           var rawB = textB.charAt(pos);
           var foldedA = rawA.toLocaleLowerCase();
           var foldedB = rawB.toLocaleLowerCase();

           var order = foldedA.localeCompare(foldedB);
           if (order !== 0) {
               return order;
           }

           // localeCompare reported a tie although the characters are not the same
           if (foldedA !== foldedB) {
               return (foldedA < foldedB) ? -1 : 1;
           }

           if (caseOrder === 0 && rawA !== rawB) {
               caseOrder = (rawA < rawB) ? -1 : 1;
           }
       }

       if (textA.length !== textB.length) {
           return (textA.length < textB.length) ? -1 : 1;
       }

       // same letters in both texts - only the case can tell them apart
       return caseOrder;
   }

Po jej použití som už dosahoval celkom uspokojivé výsledky:

"cudzí", "čučoriedka", "trstina", "tŕň", "ťava"

Dodatočná poznámka z nasledujúceho dňa: bodaj by porazilo celý ten javascript. Metóda localeCompare funguje na každom prehliadači inak, dokonca sa mi zdá, že je aj rozdiel medzi IE verziami (vďaka čomu som si ešte aj túto metódu musel naprogramovať sám - a áno, nakoniec som vymenovával znaky s diakritikou :(). Kde toto všetko skončí.

Matej Tymes's Weblog

Friday, May 6, 2011

Javascript localization hell

Wednesday, May 4, 2011

Porovnanie stringov s diakritikou v javascripte