差别

这里会显示出您选择的修订版和当前版本之间的差别。

--- python-files:pystardict [2010/08/13 08:20]
+++ python-files:pystardict [2010/08/13 08:20] (当前版本)
@@ 行 1: / 行 1: @@
+====== PyStarDict源码 pystardict.py ======
+<code python>
+# -*- coding: utf-8 -*-
+"""
+Copyright 2008 Serge Matveenko
+This file is part of PyStarDict.
+PyStarDict is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+PyStarDict is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with PyStarDict.  If not, see <http://www.gnu.org/licenses/>.
+@author: Serge Matveenko <s@matveenko.ru>
+"""
+import gzip
+import hashlib
+import re
+from struct import unpack
+class _StarDictIfo(object):
+    """
+    The .ifo file has the following format:
+    StarDict's dict ifo file
+    version=2.4.2
+    [options]
+    Note that the current "version" string must be "2.4.2" or "3.0.0".  If it's not,
+    then StarDict will refuse to read the file.
+    If version is "3.0.0", StarDict will parse the "idxoffsetbits" option.
+    [options]
+    ---------
+    In the example above, [options] expands to any of the following lines
+    specifying information about the dictionary.  Each option is a keyword
+    followed by an equal sign, then the value of that option, then a
+    newline.  The options may be appear in any order.
+    Note that the dictionary must have at least a bookname, a wordcount and a
+    idxfilesize, or the load will fail.  All other information is optional.  All
+    strings should be encoded in UTF-8.
+    Available options:
+    bookname=      // required
+    wordcount=     // required
+    synwordcount=  // required if ".syn" file exists.
+    idxfilesize=   // required
+    idxoffsetbits= // New in 3.0.0
+    author=
+    email=
+    website=
+    description=    // You can use <br> for new line.
+    date=
+    sametypesequence= // very important.
+    """
+    def __init__(self, dict_prefix, container):
+        ifo_filename = '%s.ifo' % dict_prefix
+        try:
+            _file = open(ifo_filename)
+        except IOError:
+            raise Exception('.ifo file does not exists')
+        # skipping ifo header
+        _file.readline()
+        _line = _file.readline().split('=')
+        if _line[0] == 'version':
+            self.version = _line[1]
+        else:
+            raise Exception('ifo has invalid format')
+        _config = {}
+        for _line in _file:
+            _line_splited = _line.split('=')
+            _config[_line_splited[0]] = _line_splited[1]
+        self.bookname = _config.get('bookname', None).strip()
+        if self.bookname is None: raise Exception('ifo has no bookname')
+        self.wordcount = _config.get('wordcount', None)
+        if self.wordcount is None: raise Exception('ifo has no wordcount')
+        self.wordcount = int(self.wordcount)
+        if self.version == '3.0.0':
+            try:
+                _syn = open('%s.syn' % dict_prefix)
+                self.synwordcount = _config.get('synwordcount', None)
+                if self.synwordcount is None:
+                    raise Exception('ifo has no synwordcount but .syn file exists')
+                self.synwordcount = int(self.synwordcount)
+            except IOError:
+                pass
+        self.idxfilesize = _config.get('idxfilesize', None)
+        if self.idxfilesize is None: raise Exception('ifo has no idxfilesize')
+        self.idxfilesize = int(self.idxfilesize)
+        self.idxoffsetbits = _config.get('idxoffsetbits', 32)
+        self.idxoffsetbits = int(self.idxoffsetbits)
+        self.author = _config.get('author', '').strip()
+        self.email = _config.get('email', '').strip()
+        self.website = _config.get('website', '').strip()
+        self.description = _config.get('description', '').strip()
+        self.date = _config.get('date', '').strip()
+        self.sametypesequence = _config.get('sametypesequence', '').strip()
+class _StarDictIdx(object):
+    """
+    The .idx file is just a word list.
+    The word list is a sorted list of word entries.
+    Each entry in the word list contains three fields, one after the other:
+         word_str;  // a utf-8 string terminated by '\0'.
+         word_data_offset;  // word data's offset in .dict file
+         word_data_size;  // word data's total size in .dict file
+    """
+    def __init__(self, dict_prefix, container):
+        idx_filename = '%s.idx' % dict_prefix
+        idx_filename_gz = '%s.gz' % idx_filename
+        try:
+            file = open_file(idx_filename, idx_filename_gz)
+        except:
+            raise Exception('.idx file does not exists')
+        """ check file size """
+        self._file = file.read()
+        if file.tell() != container.ifo.idxfilesize:
+            raise Exception('size of the .idx file is incorrect')
+        """ prepare main dict and parsing parameters """
+        self._idx = {}
+        idx_offset_bytes_size = int(container.ifo.idxoffsetbits / 8)
+        idx_offset_format = {4: 'L', 8: 'Q',}[idx_offset_bytes_size]
+        idx_cords_bytes_size = idx_offset_bytes_size + 4
+        """ parse data via regex """
+        record_pattern = r'([\d\D]+?\x00[\d\D]{%s})' % idx_cords_bytes_size
+        matched_records = re.findall(record_pattern, self._file)
+        """ check records count """
+        if len(matched_records) != container.ifo.wordcount:
+            raise Exception('words count is incorrect')
+        """ unpack parsed records """
+        for matched_record in matched_records:
+            c = matched_record.find('\x00') + 1
+            record_tuple = unpack('!%sc%sL' % (c, idx_offset_format),
+                matched_record)
+            word, cords = record_tuple[:c-1], record_tuple[c:]
+            self._idx[word] = cords
+    def __getitem__(self, word):
+        """
+        returns tuple (word_data_offset, word_data_size,) for word in .dict
+        @note: here may be placed flexible search realization
+        """
+        return self._idx[tuple(word)]
+    def __contains__(self, k):
+        """
+        returns True if index has a word k, else False
+        """
+        return tuple(k) in self._idx
+    def __eq__(self, y):
+        """
+        returns True if hashlib.md5(x.idx) is equal to hashlib.md5(y.idx), else False
+        """
+        return hashlib.md5(self._file).hexdigest() == hashlib.md5(y._file).hexdigest()
+    def __ne__(self, y):
+        """
+        returns True if hashlib.md5(x.idx) is not equal to hashlib.md5(y.idx), else False
+        """
+        return not self.__eq__(y)
+class _StarDictDict(object):
+    """
+    The .dict file is a pure data sequence, as the offset and size of each
+    word is recorded in the corresponding .idx file.
+    If the "sametypesequence" option is not used in the .ifo file, then
+    the .dict file has fields in the following order:
+    ==============
+    word_1_data_1_type; // a single char identifying the data type
+    word_1_data_1_data; // the data
+    word_1_data_2_type;
+    word_1_data_2_data;
+    ...... // the number of data entries for each word is determined by
+           // word_data_size in .idx file
+    word_2_data_1_type;
+    word_2_data_1_data;
+    ......
+    ==============
+    It's important to note that each field in each word indicates its
+    own length, as described below.  The number of possible fields per
+    word is also not fixed, and is determined by simply reading data until
+    you've read word_data_size bytes for that word.
+    Suppose the "sametypesequence" option is used in the .idx file, and
+    the option is set like this:
+    sametypesequence=tm
+    Then the .dict file will look like this:
+    ==============
+    word_1_data_1_data
+    word_1_data_2_data
+    word_2_data_1_data
+    word_2_data_2_data
+    ......
+    ==============
+    The first data entry for each word will have a terminating '\0', but
+    the second entry will not have a terminating '\0'.  The omissions of
+    the type chars and of the last field's size information are the
+    optimizations required by the "sametypesequence" option described
+    above.
+    If "idxoffsetbits=64", the file size of the .dict file will be bigger
+    than 4G. Because we often need to mmap this large file, and there is
+    a 4G maximum virtual memory space limit in a process on the 32 bits
+    computer, which will make we can get error, so "idxoffsetbits=64"
+    dictionary can't be loaded in 32 bits machine in fact, StarDict will
+    simply print a warning in this case when loading. 64-bits computers
+    should haven't this limit.
+    Type identifiers
+    ----------------
+    Here are the single-character type identifiers that may be used with
+    the "sametypesequence" option in the .idx file, or may appear in the
+    dict file itself if the "sametypesequence" option is not used.
+    Lower-case characters signify that a field's size is determined by a
+    terminating '\0', while upper-case characters indicate that the data
+    begins with a network byte-ordered guint32 that gives the length of
+    the following data's size(NOT the whole size which is 4 bytes bigger).
+    'm'
+    Word's pure text meaning.
+    The data should be a utf-8 string ending with '\0'.
+    'l'
+    Word's pure text meaning.
+    The data is NOT a utf-8 string, but is instead a string in locale
+    encoding, ending with '\0'.  Sometimes using this type will save disk
+    space, but its use is discouraged.
+    'g'
+    A utf-8 string which is marked up with the Pango text markup language.
+    For more information about this markup language, See the "Pango
+    Reference Manual."
+    You might have it installed locally at:
+    file:///usr/share/gtk-doc/html/pango/PangoMarkupFormat.html
+    't'
+    English phonetic string.
+    The data should be a utf-8 string ending with '\0'.
+    Here are some utf-8 phonetic characters:
+    θʃŋʧðʒæıʌʊɒɛəɑɜɔˌˈːˑṃṇḷ
+    æɑɒʌәєŋvθðʃʒɚːɡˏˊˋ
+    'x'
+    A utf-8 string which is marked up with the xdxf language.
+    See http://xdxf.sourceforge.net
+    StarDict have these extention:
+    <rref> can have "type" attribute, it can be "image", "sound", "video"
+    and "attach".
+    <kref> can have "k" attribute.
+    'y'
+    Chinese YinBiao or Japanese KANA.
+    The data should be a utf-8 string ending with '\0'.
+    'k'
+    KingSoft PowerWord's data. The data is a utf-8 string ending with '\0'.
+    It is in XML format.
+    'w'
+    MediaWiki markup language.
+    See http://meta.wikimedia.org/wiki/Help:Editing#The_wiki_markup
+    'h'
+    Html codes.
+    'r'
+    Resource file list.
+    The content can be:
+    img:pic/example.jpg     // Image file
+    snd:apple.wav           // Sound file
+    vdo:film.avi            // Video file
+    att:file.bin            // Attachment file
+    More than one line is supported as a list of available files.
+    StarDict will find the files in the Resource Storage.
+    The image will be shown, the sound file will have a play button.
+    You can "save as" the attachment file and so on.
+    'W'
+    wav file.
+    The data begins with a network byte-ordered guint32 to identify the wav
+    file's size, immediately followed by the file's content.
+    'P'
+    Picture file.
+    The data begins with a network byte-ordered guint32 to identify the picture
+    file's size, immediately followed by the file's content.
+    'X'
+    this type identifier is reserved for experimental extensions.
+    """
+    def __init__(self, dict_prefix, container):
+        """
+        opens regular or dziped .dict file
+        """
+        self._container = container
+        dict_filename = '%s.dict' % dict_prefix
+        dict_filename_dz = '%s.dz' % dict_filename
+        try:
+            self._file = open_file(dict_filename, dict_filename_dz)
+        except:
+            raise Exception('.dict file does not exists')
+    def __getitem__(self, word):
+        """
+        returns data from .dict for word
+        """
+        # getting word data coordinats
+        cords = self._container.idx[word]
+        # seeking in file for data
+        self._file.seek(cords[0])
+        # reading data
+        bytes = self._file.read(cords[1])
+        return bytes
+class _StarDictSyn(object):
+    def __init__(self, dict_prefix, container):
+        syn_filename = '%s.syn' % dict_prefix
+        try:
+            self._file = open(syn_filename)
+        except IOError:
+            # syn file is optional, passing silently
+            pass
class Dictionary(dict):
    """
    Dictionary-like class for lazy manipulating stardict dictionaries
    Translations are read from the .dict file on first access and cached;
    the cache lives in memory only and nothing is ever written back to
    the dictionary files.
    We assume in this documentation that "x" or "y" is instances of the
    Dictionary class and "x.{ifo,idx{,.gz},dict{,.dz},syn}" or
    "y.{ifo,idx{,.gz},dict{,.dz},syn}" is files of the corresponding stardict
    dictionaries.
    Most of the remaining "dict" methods are not implemented yet and raise
    NotImplementedError.
    """

    def __init__(self, filename_prefix):
        """
        filename_prefix: path to dictionary files without files extensions
        initializes new Dictionary instance from stardict dictionary files
        provided by filename_prefix
        """
        # reading somedict.ifo
        self.ifo = _StarDictIfo(dict_prefix=filename_prefix, container=self)
        # reading somedict.idx or somedict.idx.gz
        self.idx = _StarDictIdx(dict_prefix=filename_prefix, container=self)
        # reading somedict.dict or somedict.dict.dz
        self.dict = _StarDictDict(dict_prefix=filename_prefix, container=self)
        # reading somedict.syn (optional)
        self.syn = _StarDictSyn(dict_prefix=filename_prefix, container=self)
        # initializing cache
        self._dict_cache = {}

    def __cmp__(self, y):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def __contains__(self, k):
        """
        returns True if x.idx has a word k, else False
        """
        return k in self.idx

    def __delitem__(self, k):
        """
        frees cache from word k translation
        raises KeyError if k is not cached
        """
        del self._dict_cache[k]

    def __eq__(self, y):
        """
        returns True if x.idx is equal to y.idx, else False
        returns NotImplemented if y has no idx attribute
        """
        other_idx = getattr(y, 'idx', None)
        if other_idx is None:
            return NotImplemented
        return self.idx == other_idx

    def __ge__(self, y):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def __getitem__(self, k):
        """
        returns translation for word k from cache or not and then caches
        """
        try:
            return self._dict_cache[k]
        except KeyError:
            pass
        value = self.dict[k]
        self._dict_cache[k] = value
        return value

    def __gt__(self, y):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def __iter__(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def __le__(self, y=None):
        """
        raises NotImplemented exception
        y is accepted so that "x <= y" reaches this method
        """
        raise NotImplementedError()

    def __len__(self):
        """
        returns number of words provided by wordcount parameter of the x.ifo
        """
        return self.ifo.wordcount

    def __lt__(self, y=None):
        """
        raises NotImplemented exception
        y is accepted so that "x < y" reaches this method
        """
        raise NotImplementedError()

    def __ne__(self, y):
        """
        returns the negation of __eq__
        """
        result = self.__eq__(y)
        if result is NotImplemented:
            return result
        return not result

    def __repr__(self):
        """
        returns classname and bookname parameter of the x.ifo
        """
        return u'%s %s' % (self.__class__, self.ifo.bookname)

    def __setitem__(self, k, v):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def clear(self):
        """
        clear dict cache
        """
        self._dict_cache = dict()

    def get(self, k, d=''):
        """
        returns translation of the word k from self.dict or d if k not in x.idx
        d defaults to empty string
        an existing word with an empty translation returns that translation,
        not d
        """
        if k in self:
            return self[k]
        return d

    def has_key(self, k):
        """
        returns True if self.idx has a word k, else False
        """
        return k in self

    def items(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def iteritems(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def iterkeys(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def itervalues(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def keys(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def pop(self, k, d):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def popitem(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def setdefault(self, k, d):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def update(self, E, **F):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def values(self):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()

    def fromkeys(self, S, v=None):
        """
        raises NotImplemented exception
        """
        raise NotImplementedError()
def open_file(regular, gz):
    """
    Open regular file if it can be opened, gz file otherwise.

    regular: path of the uncompressed file
    gz: path of the gzip-compressed variant

    Both are opened for binary reading.  Returns the open file object.
    If neither file can be opened, raises ValueError naming both paths.
    """
    try:
        return open(regular, 'rb')
    except IOError:
        # fall through to the compressed variant
        pass
    try:
        return gzip.open(gz, 'rb')
    except IOError:
        raise ValueError(
            'Neither regular nor gz file exists: %r, %r' % (regular, gz))
+</code>

Python 俱乐部

用户工具

站点工具

差别

页面工具