Guess_Language

# Copyright (c) 2001 LOGILAB S.A. (Paris, FRANCE).
# http://www.logilab.fr/ -- mailto:contact@logilab.fr
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

"""Guess_language tries to guess the language of an HTML page using a
simple scoring technique (coefficients were chosen by hand, but we may end
up using a learning algorithm later on).

For now, it can recognize 6 languages: italian, spanish, german, portuguese,
english and french.
"""

__revision__ = "$Id: guess_language.py,v 1.4 2001/05/05 10:37:05 nico Exp $"

import source_vector

import string
import re

# data #########################################################################

DICT = {

    'en':{ # -- ENGLISH
    # words
    re.compile('^yes$'): 1, re.compile('^the$'):1, re.compile('^you$'):1,
    re.compile('^have$'):1, re.compile('^is$'): 1, re.compile('^and$'):1,
    re.compile('^by$'):  1, re.compile('^for$'):1, re.compile('^who$'):1,
    re.compile('^when$'):1, re.compile('^why$'):1, re.compile('^in$'):0.2,
    re.compile('^to$'):0.2,
    # end of words
    re.compile('.+y$'):0.3, re.compile('.+ght$'):0.2, re.compile('.+ing$'):0.1
    },

    'it':{ # -- ITALIAN
    # words 
    re.compile('^gli$'):  1, re.compile('^di$'):  1, re.compile('^nel$'):1,
    re.compile('^ed$'):   1, re.compile('^per$'): 1, re.compile('^i$'):1,
    re.compile('^della$'):1, re.compile('^sono$'):1, re.compile('^dell$'):2,
    re.compile('^e$'):  0.3, re.compile('^con$'):0.2,re.compile('^pi\371$'):0.7,
    # end of words
    re.compile('.+\340$'): 0.2, re.compile('.+zione$'):0.6,
    re.compile('.+zioni$'):0.6, re.compile('.+zie$'):  0.3,
    re.compile('.+zia$'):  0.3, re.compile('.+ieri$'): 0.1,
    re.compile('.+i$'):   0.05
    },
    
    'de':{ # -- GERMAN
    # words
    re.compile('^der$'):1,  re.compile('^das$'):  1, re.compile('^ich$'):1,
    re.compile('^und$'):1,  re.compile('^nicht$'):1, re.compile('^f\374r$'):1,
    re.compile('^von$'):0.3,re.compile('^die$'):0.1,
    # end of words 
    re.compile('.+en$'):0.1,re.compile('.+ag$'):0.2,
    # patterns
    re.compile('.*isch.*'):0.2, re.compile('.*cht.*'):0.2,
    re.compile('.*wert.*'):0.5, re.compile('.*ung.*'):0.2,
    re.compile('.*k[aiou\366].*'):0.2
    },
    
    'es':{ # -- SPANISH
    # words
    re.compile('^del$'):1,re.compile('^los$'):1,re.compile('^las$'):1,
    re.compile('^este$'):1,re.compile('^el$'):1,re.compile('^por$'):1,
    re.compile('^y$'):0.2,
    # end of words
    re.compile('.+ci\363n$'):0.2,re.compile('.+ad$'):0.2,re.compile('.+ar$'):0.2
    },

    'pt':{ # -- PORTUGUESE
    # words
    re.compile('^do$'):1,re.compile('^seu$'):1,re.compile('^seus$'):1,
    #patterns
    re.compile('^.*\343o.*$'):1,re.compile('^.*\365e.*$'):1,re.compile('^.*\365a.*$'):1,
    #end of words
    re.compile('^.+\352$'):1
    },
    
    'fr':{ # -- FRENCH
    # words
    re.compile('^le$'): 1, re.compile('^des$'): 1, re.compile('^\340$'):1,
    re.compile('^et$'): 1, re.compile('^pour$'):1, re.compile('^par$'): 1,
    re.compile('^est$'):1, re.compile('^vous$'):1, re.compile('^nous$'):1,
    re.compile('^ils$'):1, re.compile('^qui$'): 1, re.compile('^quoi$'):1,
    re.compile('^de$'):0.3,re.compile('^il$'):0.2, re.compile('^l$'): 0.3,
    # patterns
    re.compile('^.+\350.*$'):0.1,re.compile('^.*\351.*\351.*$'):0.8
    }

    } # -- end of DICT
    

# SPECIAL_CHARACTERS
# For each language code, the characters looked for inside every word and the
# weight each of them adds to the special character score of the language.
# The characters are written as octal escapes (presumably ISO-8859-1 --
# to be confirmed against the encoding of the words handed to test_lang).
SCHARS = {
    'en': {},
    'it': {},
    'de': {
        '\337': 1,
        '\354': 0.5,
        '\344': 0.5,
        '\366': 0.5,
        '\374': 0.5,
    },
    'es': {
        '\361': 1,
        '\355': 0.2,
        '\363': 0.5,
    },
    'pt': {
        '\343': 1,
        '\365': 1,
    },
    'fr': {
        '\350': 0.1,
        '\351': 0.1,
    },
}


# CONDITIONS
# Thresholds read by test_lang to decide that a language is recognised.

# the language is recognised when the pattern score, as a percentage of the
# total number of words, is greater than this value
U_RATIO = {
    'en': 5,
    'it': 5,
    'de': 5,
    'es': 5,
    'pt': 5,
    'fr': 5,
}

# NOTE(review): not referenced by the functions of this module
U_NBWORDS = {
    'en': 10,
    'it': 10,
    'de': 10,
    'es': 10,
    'pt': 7,
    'fr': 10,
}

# the language is recognised when the special character score is greater
# than this value
U_SCHARS = {
    'de': 5,
    'es': 5,
    'pt': 4,
    'fr': 6,
}

# [percentage threshold, special character threshold]: the language is
# recognised when both thresholds are exceeded at the same time
COUPLED_CONDITIONS = {
    'de': [3, 4],
    'es': [3, 3],
    'pt': [3, 2],
}

# functions ####################################################################


# This function is called when we're not sure of the text's language
# The result will be the value returned by max(dict).

def max(dict):
    """
    Return the key of the dictionary whose value gets the highest score,
    the score of a value being value[0] * (value[1] + 1).

    When several keys share the highest score, the first one met while
    iterating over the dictionary is returned. None is returned for an
    empty dictionary.
    """
    best_key = None
    best_score = 0
    for key in dict.keys():
        entry = dict[key]
        score = entry[0] * (entry[1] + 1)
        # the first key is accepted whatever its score, the following ones
        # have to do strictly better than the best score seen so far
        if best_key is None or score > best_score:
            best_key = key
            best_score = score
    return best_key


def test_lang(lang,word_list,enough = 1):
    """
    Tests if the list of words matches a given language. The general idea is,
    for each word in a text, to try to find if this word match any singularity
    (special word, special character, etc.) of the known languages.

    Arguments:
      lang       the language ('en', 'fr', 'de', ...)
      word_list  a list of couples (word, # occurences in the text)
      enough     less than 20 different words is not enough

    Return value:
      if we're sure of the text's language : return 1
      else : return a couple representing the number of special patterns and of
      special characters found
    """
    
    nb_words, special_char, total_nb_words = 0, 0, 0

    for word, nb_occurs in word_list:
        total_nb_words = total_nb_words + nb_occurs
        
        # for the specified language, tests if one of the predefined pattern
        # matches the word and then increments consequently nb_words
        try:
            regexp_dict = DICT[lang]
            for pattern in regexp_dict.keys():
                if pattern.match(word):
                    nb_words = nb_words + regexp_dict[pattern]*nb_occurs
                    break
        except KeyError:
            print 'No such language as', lang
            pass

        # same thing but with special characters instead of patterns
        try:
            schars_dict = SCHARS[lang]
            for spec_char in schars_dict.keys():
                try:
                    string.index(word,spec_char)
                    special_char = special_char+schars_dict[spec_char]*nb_occurs
                except ValueError:
                    pass
        except KeyError:
            pass 
    
    # if not enough words in the text => decide later
    if not enough:
        if nb_words == 1:
            nb_words = 0.9
        #print "*"*60
        return nb_words,special_char

    # % 
    ratio = nb_words*100/total_nb_words

    
    try:          
        c_ratio = COUPLED_CONDITIONS[lang][0]
        c_schar = COUPLED_CONDITIONS[lang][1]
        # if this test works, we're sure of the language => return 1
        if ratio > c_ratio and special_char > c_schar:
            return 1,special_char
    except KeyError:
        pass

     # if this test works, we're sure of the language => return 1
    if ratio > U_RATIO[lang]:
        return 1,special_char

    try:
        # if this test works, we're sure of the language => return 1
        if special_char > U_SCHARS[lang]:
            return 1,special_char
    except KeyError:
        pass
    
    # Here, we'll decide later
    if nb_words == 1:
        nb_words = 0.9

    return nb_words,special_char
    

            
def identify_language_from_url(url):
    """
    This function takes an url, and returns the language of
    the url text ('en','de', ...)
    """
    
    var = {
        'nb_word':0,       # word total number in the text
        'nb_diff_word':0,  # different word number in the text
        'nb_max_occurs':0, # occurs number for the word that's most in the text
        'indice_num':-1,   # number of the indice of the list_vector
                           # that represents the text theme
        'nb_repr_word':0,  # word number in the representative list
        }

    # retrieve the text from the url
    data = source_vector.parse_html_url(url)
    
    # sort the text by number of words' occurences
    list_vector,var = source_vector.count_words_occurs(data,var)

    languageList = ['en','fr','it','pt','es','de']
    res_dict={}
    enough = 1

    # warn if not enough different words in the text
    if len(list_vector) < 20:
        if not list_vector:
            print "No words in this document !"
            return 'en'
        enough = 0
        print 'Warning : only ',len(list_vector),' different words !'

    # try to guess the language
    for language in languageList:
        nb_words, schars = test_lang(language,list_vector,enough)
        res_dict[language] = [nb_words,schars]

        if not enough:
            continue
        if nb_words == 1:
            return language
    
    return max(res_dict)


# test #########################################################################

def browse_directory(dir_path,fileList):
    """
    This function opens a directory, then for all its files, reads each line
    (corresponding to an url) and guess its language
    """
    res_file = open("/home/adim/proxy/python/results","w")
    for file in fileList :
        lang = file
        filePath=dir_path+'/'+file
        f = open(filePath)
        urlList = f.readlines()
        print "*"*20+' '+lang+' '+"*"*20
        res_file.write("*"*20+' '+lang+' '+"*"*20)
        res_file.write("\n")
        for url in urlList:
            try:
                print "url : ",url,
                flang = identify_language_from_url(url)
                print 'expected "',lang,'", found "',flang,'"'   
            except:
                pass

        res_file.flush()
    res_file.close()

def browse_file(filePath):
    """
    This function opens a file, and then for each line of the file (corresponding
    to an url), guess the url text language
    """
    try:
        f = open(filePath)
        urlList = f.readlines()
        parsedPath=string.split(path,'/')
        lang=parsedPath[-1]
        
        for url in urlList:
            print "url : ",url,
            try:
                flang = identify_language_from_url(url)
                print 'expected "',lang,'", found "',flang,'"'
            except:
                import traceback
                traceback.print_exc()
                pass
    except:
        print sys.exc_info()[0], 'open ',filePath,' failed'


# main #########################################################################

if __name__ == '__main__':
    import sys
    import os
    args = sys.argv[1:]
        
    if args :
        for path in args :
            try:
                lang = identify_language_from_url(path)
                print path,"-->",lang
            except Exception, e:
                pass
    else:
        print "Usage: guess_language.py URL ..."
        print "As in guess_language http://www.logilab.fr/ "+\
              "file:///home/user/file.html"

Tutorials

Linux System Admin Tips: There are over 200 Linux tips and tricks in this article. That is over 100 pages covering everything from NTP, setting up 2 IP addresses on one NIC, sharing directories among several users, putting running jobs in the background, finding out who is doing what on your system by examining open sockets and the ps command, how to watch a file, how to prevent even root from deleting a file, tape commands, setting up cron jobs, using rsync, using screen conveniently with emacs, how to kill every process for a user, security tips and a lot more. These tips grow weekly. The above link will download the text version for easy grep searching. There is also an html version here.

Breaking Firewalls with OpenSSH and PuTTY: If the system administrator deliberately filters out all traffic except port 22 (ssh), to a single server, it is very likely that you can still gain access to other computers behind the firewall. This article shows how remote Linux and Windows users can gain access to firewalled samba, mail, and http servers. In essence, it shows how OpenSSH and PuTTY can be used as a VPN solution for your home or workplace.

MySQL Tips and Tricks: Find out who is doing what in MySQL and how to kill the process, create binary log files, connect, create and select with Perl and Java, remove duplicates in a table with the index command, rollback and how to apply, merging several tables into one, updating foreign keys, monitor port 3306 with the tcpdump command, creating a C API, complex selects, and much more.

Create a Live Linux CD - BusyBox and OpenSSH Included: These steps will show you how to create a functioning Linux system, with the latest 2.6 kernel compiled from source, and how to integrate the BusyBox utilities including the installation of DHCP. Plus, how to compile in the OpenSSH package on this CD based system. On system boot-up a filesystem will be created and the contents from the CD will be uncompressed and completely loaded into RAM -- the CD could be removed at this point for boot-up on a second computer. The remaining functioning system will have full ssh capabilities. You can take over any PC assuming, of course, you have configured the kernel with the appropriate drivers and the PC can boot from a CD. This tutorial steps you through the whole process.

SQLite Tutorial : This article explores the power and simplicity of sqlite3, first by starting with common commands and triggers, then the attach statement with the union operation is introduced in a way that allows multiple tables, in separate databases, to be combined as one virtual table, without the overhead of copying or moving data. Next, the simple sign function and the amazingly powerful trick of using this function in SQL select statements to solve complex queries with a single pass through the data is demonstrated, after making a brief mathematical case for how the sign function defines the absolute value and IF conditions.

The Lemon Parser Tutorial: This article explains how to build grammars and programs using the lemon parser, which is faster than yacc. And, unlike yacc, it is thread safe.

How to Compile the 2.6 kernel for Red Hat 9 and 8.0 and get Fedora Updates: This is a step by step tutorial on how to compile the 2.6 kernel from source.

Virtual Filesystem: Building A Linux Filesystem From An Ordinary File. You can take a disk file, format it as an ext2, ext3, or reiser filesystem and then mount it, just like a physical drive. Yes, it is then possible to read and write files to this newly mounted device. You can also copy the complete filesystem, since it is just a file, to another computer. If security is an issue, read on. This article will show you how to encrypt the filesystem, and mount it with ACL (Access Control Lists), which give you rights beyond the traditional read (r), write (w) and execute (x) for the 3 user classes: owner, group and other.

Working With Time: What? There are 61 seconds in a minute? We can go back in time? We still tell time by the sun?

Chirico img Mike Chirico, a father of triplets (all girls) lives outside of Philadelphia, PA, USA. He has worked with Linux since 1996, has a Masters in Computer Science and Mathematics from Villanova University, and has worked in computer-related jobs from Wall Street to the University of Pennsylvania. His hero is Paul Erdos, a brilliant number theorist who was known for his open collaboration with others.

Mike's notes page is souptonuts. For open source consulting needs, please send an email to mchirico@gmail.com. All consulting work must include a donation to SourceForge.net.