古いバージョンのpythonとpostgreでDBのデータをxmlに変換して出力した
python2.4 とpostgreを使ってDBのバックアップを取るためにxmlとして出力する必要がありまして、その時のメモです。
方法は二つあって、DOMをメモリ上に全部もってからまとめて吐き出す方法と地道に一つずつ書き出していく方法があります。
小さなDBだったらまとめて吐き出せばいいと思うのですが、今回50万行以上のレコードがあったためそれだとメモリに乗らずに
スワップを使い切りそうでした。(実行してから教えてもらって気づいた・・・。)
以下、実例。
# -*- coding: utf-8 -*- import datetime import pgdb import xml.dom.minidom, codecs from xml.sax import saxutils HOST = "localhost" DATABASE = "sample" USER = "ayahito" PASSWORD = "" FILE_NAME = "backup.xml" STEP_NUM = 10000 def get_connecter(host, database, user, password): connecter = pgdb.connect( host=host,database=database,user=user,password=password) return connecter def get_cursor(connecter): return connecter.cursor() def get_count(connecter, tablename): cursor = get_cursor(connecter) cursor.execute(""" SELECT COUNT(*) FROM %s; """ % (tablename)) rows = cursor.fetchall() count = rows[0][0] return count def get_table_names(connecter): cursor = get_cursor(connecter) cursor.execute("""SELECT relname AS table_name FROM pg_stat_user_tables;""") rows = cursor.fetchall() rows = [row[0] for row in rows] return rows def get_table_records_and_column_names(connecter, table_name): cursor = get_cursor(connecter) cursor.execute("SELECT * from %s" % (table_name)) col_names = [cn[0] for cn in cursor.description] col_rows = cursor.fetchall() return col_rows, col_names def get_xml_doc(): doc = xml.dom.minidom.Document() return doc def create_root(doc): root = doc.createElement('root') doc.appendChild(root) return root def create_child(doc, parent, elem_name): child = doc.createElement(elem_name) parent.appendChild(child) return child def append_value_to_elem(doc, elem, value): if not isinstance(value, str): value = str(value) elem.appendChild(doc.createTextNode(value)) def write(doc, file_name): f1 = codecs.open(file_name, 'w', 'utf-8') doc.writexml(f1) f1.close() def write_root(file_name): file_name.write("""<?xml version="1.0" ?><root>""") def write_end_root(file_name): file_name.write("""</root>""") def write_start_tag(file_name, name): file_name.write("<" + saxutils.escape(name) + ">") def write_end_tag(file_name, name): file_name.write("</" + saxutils.escape(name) + ">") def write_value(file_name, value): if (isinstance(value, int) or isinstance(value, float) or isinstance(value, long)): value = str(value) elif (isinstance(value, datetime.datetime) or isinstance(value, datetime.date)): value = "%s" % (value.isoformat()) elif isinstance(value, str): value = value.decode('utf-8') else: value = str(value) file_name.write(saxutils.escape(value)) def step_execute(f, table_name, connecter): records = True last_used_id = 0 while records: cursor = get_cursor(connecter) cursor.execute( "SELECT * from %s where id > %s order by id limit %s" % ( table_name, last_used_id, STEP_NUM)) col_names = [cn[0] for cn in cursor.description] records = cursor.fetchall() write_row(f, records, col_names) if records: last_record = records[-1] last_used_id = last_record[col_names.index("id")] def write_row(f, records, col_names): for record in records: write_start_tag(f, "row") for col_name, value in zip(col_names, record): write_start_tag(f, col_name) write_value(f, value) write_end_tag(f, col_name) write_end_tag(f, "row") def execute(): doc = get_xml_doc() root = create_root(doc) connecter = get_connecter(HOST, DATABASE, USER, PASSWORD) table_names = get_table_names(connecter) for table_name in table_names: table_elem = create_child(doc, root, table_name) records, col_names = get_table_records_and_column_names( connecter, table_name) for record in records: row_elem = create_child(doc, table_elem, "row") for col_name, value in zip(col_names, record): colum_elem = create_child(doc, row_elem, col_name) append_value_to_elem(doc, colum_elem, value) write(doc, FILE_NAME) def less_memory_execute(): connecter = get_connecter(HOST, DATABASE, USER, PASSWORD) table_names = get_table_names(connecter) f = codecs.open("backup.xml","w","utf-8") write_root(f) for table_name in table_names: write_start_tag(f, table_name) if get_count(connecter, table_name) > STEP_NUM: step_execute(f, table_name, connecter) else: records, col_names = get_table_records_and_column_names( connecter, table_name) write_row(f, records, col_names) write_end_tag(f, table_name) write_end_root(f) if __name__ == "__main__": less_memory_execute()
メモリに一度に乗らないように決められた行数分だけレコードを取得して
ファイルに書き出しています。定数使ったのでSTEP_NUM書き換えれば一度に取得する値も変えられます。