这里会显示出您选择的修订版和当前版本之间的差别。
— |
python-files:dbf [2010/08/01 12:54] (当前版本) |
||
---|---|---|---|
行 1: | 行 1: | ||
+ | ====== Python 读取DBF/FPT 文件 ====== | ||
+ | |||
+ | DBF/FPT是 FoxPro 数据库存储文件的格式,虽然现在 FoxPro 已经不用了,但是有些情况下我们需要读取DBF/FPT数据库文件。这里整理了用Python读取DBF/FPT文件相关 Python 模块。 | ||
+ | |||
+ | ===== FPT 文件格式 ===== | ||
+ | |||
+ | FPT文件是FoxPro存储备注信息的文件。 | ||
+ | |||
+ | <code> | ||
+ | The file format is used by Fox Pro 2.x and later The size of the header is 512 bytes | ||
+ | |||
+ | _______________________ _______ | ||
+ | 00h / 0 | Number of next | ^ | ||
+ | 00h / 1 | available block | | | ||
+ | 00h / 2 | for appending data | Header | ||
+ | 00h / 3 | (binary) *1| | | ||
+ | |-----------------------| | | ||
+ | 00h / 4 | ( Reserved ) | | | ||
+ | 00h / 5 | | | | ||
+ | |-----------------------| | | ||
+ | 00h / 6 | Size of blocks N *1| | | ||
+ | 00h / 7 | *2| | | ||
+ | |-----------------------| | | ||
+ | 00h / 8 | ( Reserved ) | | | ||
+ | | | | | ||
+ | | | | | ||
+ | | (i.e. garbage) | | | ||
+ | : : | | ||
+ | : : | | ||
+ | 00h / 511| | | | ||
+ | |=======================| _v_____ | ||
+ | 00h / 0| | ^ Used block | ||
+ | | | | __ |=======================| | ||
+ | | | | / 0| Record type *3| | ||
+ | : : | / 1| *1| | ||
+ | : : | / 2| | | ||
+ | | | | / 3| | | ||
+ | 00h / N| | | / |-----------------------| | ||
+ | |=======================| _|_____/ 4| Length of memo field | | ||
+ | 00h / 0| | | 5| *1| | ||
+ | : : | 6| | | ||
+ | : : | 7| | | ||
+ | | | | |-----------------------| | ||
+ | 00h / N| | _|_____ 8| Memo data | | ||
+ | |=======================| | \ : : | ||
+ | 0| | | \ N| | | ||
+ | | | | \_____ |=======================| | ||
+ | | | | | ||
+ | : : | | ||
+ | 00h / N| | _v_____ | ||
+ | |=======================| | ||
+ | |||
+ | </code> | ||
+ | |||
+ | - Big-endian. Binary value with high byte first. | ||
+ | - Size of blocks in memo file (SET BLOCKSIZE). Default is 512 bytes. | ||
+ | - Record type | ||
+ | <code> | ||
+ | Value Description | ||
+ | 00h Picture. This normally indicates that file is produced on a MacIntosh, since pictures on the DOS/Windows platform are "objects". | ||
+ | 01h Memo | ||
+ | 02h Object | ||
+ | </code> | ||
+ | A memo field can be longer than the 512 byte block. It simply continues through the next block. The field is logically terminated by two End-of-file marks in the field. The reminder of the block is unused. | ||
+ | |||
+ | ===== Python DBF模块===== | ||
+ | |||
+ | ==== DBF.PY ==== | ||
+ | |||
+ | * http://www.physics.ox.ac.uk/users/santoso/dbf.py.src | ||
+ | |||
+ | <code python> | ||
+ | """ | ||
+ | This is a DBF reader which reads Visual Fox Pro DBF format with Memo field. | ||
+ | |||
+ | Usage: | ||
+ | rec = readDbf('test.dbf') | ||
+ | for line in rec: | ||
+ | print line['name'] | ||
+ | |||
+ | @author Yusdi Santoso | ||
+ | @date 13/07/2007 | ||
+ | """ | ||
+ | import struct | ||
+ | import os, os.path | ||
+ | import sys | ||
+ | import csv | ||
+ | import tempfile | ||
+ | import ConfigParser | ||
+ | |||
+ | class Dbase: | ||
+ | def __init__(self): | ||
+ | self.fdb = None | ||
+ | self.fmemo = None | ||
+ | self.db_data = None | ||
+ | self.memo_data = None | ||
+ | self.fields = None | ||
+ | self.num_records = 0 | ||
+ | self.header = None | ||
+ | self.memo_file = '' | ||
+ | self.memo_header = None | ||
+ | self.memo_block_size = 0 | ||
+ | self.memo_header_len = 0 | ||
+ | |||
+ | def _drop_after_NULL(self, txt): | ||
+ | for i in range(0, len(txt)): | ||
+ | if ord(struct.unpack('c', txt[i])[0])==0: | ||
+ | return txt[:i] | ||
+ | return txt | ||
+ | |||
+ | def _reverse_endian(self, num): | ||
+ | if not len(num): | ||
+ | return 0 | ||
+ | val = struct.unpack('<L', num) | ||
+ | val = struct.pack('>L', val[0]) | ||
+ | val = struct.unpack('>L', val) | ||
+ | return val[0] | ||
+ | |||
+ | def _assign_ids(self, lst, ids): | ||
+ | result = {} | ||
+ | idx = 0 | ||
+ | for item in lst: | ||
+ | id = ids[idx] | ||
+ | result[id] = item | ||
+ | idx += 1 | ||
+ | return result | ||
+ | |||
+ | def open(self, db_name): | ||
+ | filesize = os.path.getsize(db_name) | ||
+ | if filesize <= 68: | ||
+ | raise IOError, 'The file is not large enough to be a dbf file' | ||
+ | |||
+ | self.fdb = open(db_name, 'rb') | ||
+ | |||
+ | self.memo_file = '' | ||
+ | if os.path.isfile(db_name[0:-1] + 't'): | ||
+ | self.memo_file = db_name[0:-1] + 't' | ||
+ | elif os.path.isfile(db_name[0:-3] + 'fpt'): | ||
+ | self.memo_file = db_name[0:-3] + 'fpt' | ||
+ | |||
+ | if self.memo_file: | ||
+ | #Read memo file | ||
+ | self.fmemo = open(self.memo_file, 'rb') | ||
+ | self.memo_data = self.fmemo.read() | ||
+ | self.memo_header = self._assign_ids(struct.unpack('>6x1H', self.memo_data[:8]), ['Block size']) | ||
+ | block_size = self.memo_header['Block size'] | ||
+ | if not block_size: | ||
+ | block_size = 512 | ||
+ | self.memo_block_size = block_size | ||
+ | self.memo_header_len = block_size | ||
+ | memo_size = os.path.getsize(self.memo_file) | ||
+ | |||
+ | #Start reading data file | ||
+ | data = self.fdb.read(32) | ||
+ | self.header = self._assign_ids(struct.unpack('<B 3B L 2H 20x', data), ['id', 'Year', 'Month', 'Day', '# of Records', 'Header Size', 'Record Size']) | ||
+ | self.header['id'] = hex(self.header['id']) | ||
+ | |||
+ | self.num_records = self.header['# of Records'] | ||
+ | data = self.fdb.read(self.header['Header Size']-34) | ||
+ | self.fields = {} | ||
+ | x = 0 | ||
+ | header_pattern = '<11s c 4x B B 14x' | ||
+ | ids = ['Field Name', 'Field Type', 'Field Length', 'Field Precision'] | ||
+ | pattern_len = 32 | ||
+ | for offset in range(0, len(data), 32): | ||
+ | if ord(data[offset])==0x0d: | ||
+ | break | ||
+ | x += 1 | ||
+ | data_subset = data[offset: offset+pattern_len] | ||
+ | if len(data_subset) < pattern_len: | ||
+ | data_subset += ' '*(pattern_len-len(data_subset)) | ||
+ | self.fields[x] = self._assign_ids(struct.unpack(header_pattern, data_subset), ids) | ||
+ | self.fields[x]['Field Name'] = self._drop_after_NULL(self.fields[x]['Field Name']) | ||
+ | |||
+ | self.fdb.read(3) | ||
+ | if self.header['# of Records']: | ||
+ | data_size = (self.header['# of Records'] * self.header['Record Size']) - 1 | ||
+ | self.db_data = self.fdb.read(data_size) | ||
+ | else: | ||
+ | self.db_data = '' | ||
+ | self.row_format = '<' | ||
+ | self.row_ids = [] | ||
+ | self.row_len = 0 | ||
+ | for key in self.fields: | ||
+ | field = self.fields[key] | ||
+ | self.row_format += '%ds ' % (field['Field Length']) | ||
+ | self.row_ids.append(field['Field Name']) | ||
+ | self.row_len += field['Field Length'] | ||
+ | |||
+ | def close(self): | ||
+ | if self.fdb: | ||
+ | self.fdb.close() | ||
+ | if self.fmemo: | ||
+ | self.fmemo.close() | ||
+ | |||
+ | def get_numrecords(self): | ||
+ | return self.num_records | ||
+ | |||
+ | def get_record_with_names(self, rec_no): | ||
+ | """ | ||
+ | This function accept record number from 0 to N-1 | ||
+ | """ | ||
+ | if rec_no < 0 or rec_no > self.num_records: | ||
+ | raise Exception, 'Unable to extract data outside the range' | ||
+ | |||
+ | offset = self.header['Record Size'] * rec_no | ||
+ | data = self.db_data[offset:offset+self.row_len] | ||
+ | record = self._assign_ids(struct.unpack(self.row_format, data), self.row_ids) | ||
+ | |||
+ | if self.memo_file: | ||
+ | for key in self.fields: | ||
+ | field = self.fields[key] | ||
+ | f_type = field['Field Type'] | ||
+ | f_name = field['Field Name'] | ||
+ | c_data = record[f_name] | ||
+ | |||
+ | if f_type=='M' or f_type=='G' or f_type=='B' or f_type=='P': | ||
+ | c_data = self._reverse_endian(c_data) | ||
+ | if c_data: | ||
+ | record[f_name] = self.read_memo(c_data-1).strip() | ||
+ | else: | ||
+ | record[f_name] = c_data.strip() | ||
+ | return record | ||
+ | |||
+ | def read_memo_record(self, num, in_length): | ||
+ | """ | ||
+ | Read the record of given number. The second parameter is the length of | ||
+ | the record to read. It can be undefined, meaning read the whole record, | ||
+ | and it can be negative, meaning at most the length | ||
+ | """ | ||
+ | if in_length < 0: | ||
+ | in_length = -self.memo_block_size | ||
+ | |||
+ | offset = self.memo_header_len + num * self.memo_block_size | ||
+ | self.fmemo.seek(offset) | ||
+ | if in_length<0: | ||
+ | in_length = -in_length | ||
+ | if in_length==0: | ||
+ | return '' | ||
+ | return self.fmemo.read(in_length) | ||
+ | | ||
+ | def read_memo(self, num): | ||
+ | result = '' | ||
+ | buffer = self.read_memo_record(num, -1) | ||
+ | if len(buffer)<=0: | ||
+ | return '' | ||
+ | length = struct.unpack('>L', buffer[4:4+4])[0] + 8 | ||
+ | |||
+ | block_size = self.memo_block_size | ||
+ | if length < block_size: | ||
+ | return buffer[8:length] | ||
+ | rest_length = length - block_size | ||
+ | rest_data = self.read_memo_record(num+1, rest_length) | ||
+ | if len(rest_data)<=0: | ||
+ | return '' | ||
+ | return buffer[8:] + rest_data | ||
+ | |||
+ | def readDbf(filename): | ||
+ | """ | ||
+ | Read the DBF file specified by the filename and | ||
+ | return the records as a list of dictionary. | ||
+ | @param filename File name of the DBF | ||
+ | @return List of rows | ||
+ | """ | ||
+ | db = Dbase() | ||
+ | db.open(filename) | ||
+ | num = db.get_numrecords() | ||
+ | rec = [] | ||
+ | for i in range(0, num): | ||
+ | record = db.get_record_with_names(i) | ||
+ | rec.append(record) | ||
+ | db.close() | ||
+ | return rec | ||
+ | |||
+ | if __name__=='__main__': | ||
+ | rec = readDbf('dbf/sptable.dbf') | ||
+ | for line in rec: | ||
+ | print '%s %s' % (line['GENUS'].strip(), line['SPECIES'].strip()) | ||
+ | |||
+ | </code> | ||
+ | |||
+ | ==== dbfpy ==== | ||
+ | |||
+ | * http://sourceforge.net/projects/dbfpy/ | ||
+ | |||
+ | |||
+ | |||
+ | |||