# Copyright (c) 2001 LOGILAB S.A. (Paris, FRANCE). # http://www.logilab.fr/ -- mailto:contact@logilab.fr # # This program is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free Software # Foundation; either version 2 of the License, or (at your option) any later # version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along with # this program; if not, write to the Free Software Foundation, Inc., # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """Guess_language tries to guess the language of an HTML page using a simple scoring technique (coefficients were chosed by hand, but we may end up using a learning algorithm later on). For now, it can recognize 6 languages: italian, spanish, german, portuguese, english and french. 
""" __revision__ = "$Id: guess_language.py,v 1.4 2001/05/05 10:37:05 nico Exp $" import source_vector import string import re # data ######################################################################### DICT = { 'en':{ # -- ENGLISH # words re.compile('^yes$'): 1, re.compile('^the$'):1, re.compile('^you$'):1, re.compile('^have$'):1, re.compile('^is$'): 1, re.compile('^and$'):1, re.compile('^by$'): 1, re.compile('^for$'):1, re.compile('^who$'):1, re.compile('^when$'):1, re.compile('^why$'):1, re.compile('^in$'):0.2, re.compile('^to$'):0.2, # end of words re.compile('.+y$'):0.3, re.compile('.+ght$'):0.2, re.compile('.+ing$'):0.1 }, 'it':{ # -- ITALIAN # words re.compile('^gli$'): 1, re.compile('^di$'): 1, re.compile('^nel$'):1, re.compile('^ed$'): 1, re.compile('^per$'): 1, re.compile('^i$'):1, re.compile('^della$'):1, re.compile('^sono$'):1, re.compile('^dell$'):2, re.compile('^e$'): 0.3, re.compile('^con$'):0.2,re.compile('^pi\371$'):0.7, # end of words re.compile('.+\340$'): 0.2, re.compile('.+zione$'):0.6, re.compile('.+zioni$'):0.6, re.compile('.+zie$'): 0.3, re.compile('.+zia$'): 0.3, re.compile('.+ieri$'): 0.1, re.compile('.+i$'): 0.05 }, 'de':{ # -- GERMAN # words re.compile('^der$'):1, re.compile('^das$'): 1, re.compile('^ich$'):1, re.compile('^und$'):1, re.compile('^nicht$'):1, re.compile('^f\374r$'):1, re.compile('^von$'):0.3,re.compile('^die$'):0.1, # end of words re.compile('.+en$'):0.1,re.compile('.+ag$'):0.2, # patterns re.compile('.*isch.*'):0.2, re.compile('.*cht.*'):0.2, re.compile('.*wert.*'):0.5, re.compile('.*ung.*'):0.2, re.compile('.*k[aiou\366].*'):0.2 }, 'es':{ # -- SPANISH # words re.compile('^del$'):1,re.compile('^los$'):1,re.compile('^las$'):1, re.compile('^este$'):1,re.compile('^el$'):1,re.compile('^por$'):1, re.compile('^y$'):0.2, # end of words re.compile('.+ci\363n$'):0.2,re.compile('.+ad$'):0.2,re.compile('.+ar$'):0.2 }, 'pt':{ # -- PORTUGUESE # words re.compile('^do$'):1,re.compile('^seu$'):1,re.compile('^seus$'):1, #patterns 
re.compile('^.*\343o.*$'):1,re.compile('^.*\365e.*$'):1,re.compile('^.*\365a.*$'):1, #end of words re.compile('^.+\352$'):1 }, 'fr':{ # -- FRENCH # words re.compile('^le$'): 1, re.compile('^des$'): 1, re.compile('^\340$'):1, re.compile('^et$'): 1, re.compile('^pour$'):1, re.compile('^par$'): 1, re.compile('^est$'):1, re.compile('^vous$'):1, re.compile('^nous$'):1, re.compile('^ils$'):1, re.compile('^qui$'): 1, re.compile('^quoi$'):1, re.compile('^de$'):0.3,re.compile('^il$'):0.2, re.compile('^l$'): 0.3, # patterns re.compile('^.+\350.*$'):0.1,re.compile('^.*\351.*\351.*$'):0.8 } } # -- end of DICT # SPECIAL_CHARACTERS SCHARS = { 'en':{}, 'it':{}, 'de':{'\337':1,'\354':0.5,'\344':0.5,'\366':0.5,'\374':0.5}, 'es':{'\361':1,'\355':0.2,'\363':0.5}, 'pt':{'\343':1,'\365':1}, 'fr':{'\350':0.1,'\351':0.1} } # CONDITIONS U_RATIO = {'en':5,'it':5,'de':5,'es':5,'pt':5,'fr':5} U_NBWORDS = {'en':10,'it':10,'de':10,'es':10,'pt':7,'fr':10} U_SCHARS = {'de':5,'es':5,'pt':4,'fr':6} COUPLED_CONDITIONS = { 'de':[3,4], 'es':[3,3], 'pt':[3,2], } # functions #################################################################### # This function is called when we're not sure of the text's language # The result will be the value returned by max(dict). def max(dict): """ for each key of the dictionnary passed in argument, compute (dict[key][0]*(dict[key][1]+1)), and return the key corresponding to the highest value found. """ r = None max = 0 for key in dict.keys() : if r == None : r = key max = dict[key][0]*(dict[key][1]+1) else: x = dict[key][0] y = dict[key][1] + 1 if x*y > max: r= key max = x*y return r def test_lang(lang,word_list,enough = 1): """ Tests if the list of words matches a given language. The general idea is, for each word in a text, to try to find if this word match any singularity (special word, special character, etc.) of the known languages. Arguments: lang the language ('en', 'fr', 'de', ...) 
word_list a list of couples (word, # occurences in the text) enough less than 20 different words is not enough Return value: if we're sure of the text's language : return 1 else : return a couple representing the number of special patterns and of special characters found """ nb_words, special_char, total_nb_words = 0, 0, 0 for word, nb_occurs in word_list: total_nb_words = total_nb_words + nb_occurs # for the specified language, tests if one of the predefined pattern # matches the word and then increments consequently nb_words try: regexp_dict = DICT[lang] for pattern in regexp_dict.keys(): if pattern.match(word): nb_words = nb_words + regexp_dict[pattern]*nb_occurs break except KeyError: print 'No such language as', lang pass # same thing but with special characters instead of patterns try: schars_dict = SCHARS[lang] for spec_char in schars_dict.keys(): try: string.index(word,spec_char) special_char = special_char+schars_dict[spec_char]*nb_occurs except ValueError: pass except KeyError: pass # if not enough words in the text => decide later if not enough: if nb_words == 1: nb_words = 0.9 #print "*"*60 return nb_words,special_char # % ratio = nb_words*100/total_nb_words try: c_ratio = COUPLED_CONDITIONS[lang][0] c_schar = COUPLED_CONDITIONS[lang][1] # if this test works, we're sure of the language => return 1 if ratio > c_ratio and special_char > c_schar: return 1,special_char except KeyError: pass # if this test works, we're sure of the language => return 1 if ratio > U_RATIO[lang]: return 1,special_char try: # if this test works, we're sure of the language => return 1 if special_char > U_SCHARS[lang]: return 1,special_char except KeyError: pass # Here, we'll decide later if nb_words == 1: nb_words = 0.9 return nb_words,special_char def identify_language_from_url(url): """ This function takes an url, and returns the language of the url text ('en','de', ...) 
""" var = { 'nb_word':0, # word total number in the text 'nb_diff_word':0, # different word number in the text 'nb_max_occurs':0, # occurs number for the word that's most in the text 'indice_num':-1, # number of the indice of the list_vector # that represents the text theme 'nb_repr_word':0, # word number in the representative list } # retrieve the text from the url data = source_vector.parse_html_url(url) # sort the text by number of words' occurences list_vector,var = source_vector.count_words_occurs(data,var) languageList = ['en','fr','it','pt','es','de'] res_dict={} enough = 1 # warn if not enough different words in the text if len(list_vector) < 20: if not list_vector: print "No words in this document !" return 'en' enough = 0 print 'Warning : only ',len(list_vector),' different words !' # try to guess the language for language in languageList: nb_words, schars = test_lang(language,list_vector,enough) res_dict[language] = [nb_words,schars] if not enough: continue if nb_words == 1: return language return max(res_dict) # test ######################################################################### def browse_directory(dir_path,fileList): """ This function opens a directory, then for all its files, reads each line (corresponding to an url) and guess its language """ res_file = open("/home/adim/proxy/python/results","w") for file in fileList : lang = file filePath=dir_path+'/'+file f = open(filePath) urlList = f.readlines() print "*"*20+' '+lang+' '+"*"*20 res_file.write("*"*20+' '+lang+' '+"*"*20) res_file.write("\n") for url in urlList: try: print "url : ",url, flang = identify_language_from_url(url) print 'expected "',lang,'", found "',flang,'"' except: pass res_file.flush() res_file.close() def browse_file(filePath): """ This function opens a file, and then for each line of the file (corresponding to an url), guess the url text language """ try: f = open(filePath) urlList = f.readlines() parsedPath=string.split(path,'/') lang=parsedPath[-1] for url in 
urlList: print "url : ",url, try: flang = identify_language_from_url(url) print 'expected "',lang,'", found "',flang,'"' except: import traceback traceback.print_exc() pass except: print sys.exc_info()[0], 'open ',filePath,' failed' # main ######################################################################### if __name__ == '__main__': import sys import os args = sys.argv[1:] if args : for path in args : try: lang = identify_language_from_url(path) print path,"-->",lang except Exception, e: pass else: print "Usage: guess_language.py URL ..." print "As in guess_language http://www.logilab.fr/ "+\ "file:///home/user/file.html"
Linux System Admin Tips: There are over 200 Linux tips and tricks in this article. That is over 100 pages covering everything from NTP, setting up 2 IP addresses on one NIC, sharing directories among several users, putting running jobs in the background, finding out who is doing what on your system by examining open sockets and the ps command, how to watch a file, how to prevent even root from deleting a file, tape commands, setting up cron jobs, using rsync, using screen conveniently with emacs, how to kill every process for a user, security tips and a lot more. These tips grow weekly. The above link will download the text version for easy grep searching. There is also an HTML version here.
Breaking Firewalls with OpenSSH and PuTTY: If the system administrator deliberately filters out all traffic except port 22 (ssh) to a single server, it is very likely that you can still gain access to other computers behind the firewall. This article shows how remote Linux and Windows users can gain access to firewalled samba, mail, and http servers. In essence, it shows how OpenSSH and PuTTY can be used as a VPN solution for your home or workplace.
MySQL Tips and Tricks: Find out who is doing what in MySQL and how to kill the process, create binary log files, connect, create and select with Perl and Java, remove duplicates in a table with the index command, rollback and how to apply, merging several tables into one, updating foreign keys, monitor port 3306 with the tcpdump command, creating a C API, complex selects, and much more.
Create a Live Linux CD - BusyBox and OpenSSH Included: These steps will show you how to create a functioning Linux system, with the latest 2.6 kernel compiled from source, and how to integrate the BusyBox utilities including the installation of DHCP. Plus, how to compile in the OpenSSH package on this CD-based system. On system boot-up a filesystem will be created and the contents from the CD will be uncompressed and completely loaded into RAM -- the CD could be removed at this point for boot-up on a second computer. The remaining functioning system will have full ssh capabilities. You can take over any PC assuming, of course, you have configured the kernel with the appropriate drivers and the PC can boot from a CD. This tutorial steps you through the whole process.
SQLite Tutorial : This article explores the power and simplicity of sqlite3, first by starting with common commands and triggers, then the attach statement with the union operation is introduced in a way that allows multiple tables, in separate databases, to be combined as one virtual table, without the overhead of copying or moving data. Next, the simple sign function and the amazingly powerful trick of using this function in SQL select statements to solve complex queries with a single pass through the data is demonstrated, after making a brief mathematical case for how the sign function defines the absolute value and IF conditions.
The Lemon Parser Tutorial: This article explains how to build grammars and programs using the lemon parser, which is faster than yacc. And, unlike yacc, it is thread safe.
How to Compile the 2.6 kernel for Red Hat 9 and 8.0 and get Fedora Updates: This is a step by step tutorial on how to compile the 2.6 kernel from source.
Virtual Filesystem: Building A Linux Filesystem From An Ordinary File. You can take a disk file, format it as an ext2, ext3, or reiser filesystem and then mount it, just like a physical drive. Yes, it is then possible to read and write files to this newly mounted device. You can also copy the complete filesystem, since it is just a file, to another computer. If security is an issue, read on. This article will show you how to encrypt the filesystem, and mount it with ACL (Access Control Lists), which give you rights beyond the traditional read (r) write (w) and execute (x) for the 3 user groups file, owner and other.
Working With Time: What? There are 61 seconds in a minute? We can go back in time? We still tell time by the sun?
Mike Chirico, a father of triplets (all girls) lives outside of
Philadelphia, PA, USA. He has worked with Linux since 1996, has a Masters
in Computer Science and Mathematics from Villanova University, and has
worked in computer-related jobs from Wall Street to the University of
Pennsylvania. His hero is Paul Erdos, a brilliant number theorist who was
known for his open collaboration with others.
Mike's notes page is souptonuts. For
open source consulting needs, please send an email to
mchirico@gmail.com. All consulting work must include a donation to
SourceForge.net.