From 491685e892c8fedf52d89b5678d07eb54a9e3261 Mon Sep 17 00:00:00 2001 From: Burgess Leo <1799594843@qq.com> Date: Fri, 23 May 2025 18:01:42 +0800 Subject: [PATCH] optimize db_path memory --- db_manage/clear_table_record.py | 12 ++++----- files_utils/folders_save.py | 14 ++++++++++ ntfs_utils/db_node.py | 7 +++-- ntfs_utils/db_path.py | 48 +++++++++++++-------------------- ntfs_utils/main.py | 13 +++++---- 5 files changed, 48 insertions(+), 46 deletions(-) diff --git a/db_manage/clear_table_record.py b/db_manage/clear_table_record.py index 89e8598..c019d53 100644 --- a/db_manage/clear_table_record.py +++ b/db_manage/clear_table_record.py @@ -25,10 +25,10 @@ def ClearTableRecordsWithReset(db_path, table_name): if __name__ == '__main__': ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_path') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_node') - # ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_device') - # ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_config') - # ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_user') - # ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_group') - # ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_extent') - # ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_name') + ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_device') + ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_config') + ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_user') + ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_group') + ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_extent') + ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_name') diff --git a/files_utils/folders_save.py b/files_utils/folders_save.py index e69de29..601881a 100644 --- a/files_utils/folders_save.py +++ b/files_utils/folders_save.py @@ -0,0 +1,14 @@ +import sqlite3 + +db_path = "../src/db_ntfs_info.db" +conn = sqlite3.connect(db_path) +cursor = conn.cursor() + + +def GetDeviceId(cursor: sqlite3.Cursor) -> int: + cursor.execute("SELECT ID FROM db_device ORDER BY ID LIMIT 1") + result = cursor.fetchone() + return result[0] if result else 0 + + +print(GetDeviceId(cursor)) diff --git a/ntfs_utils/db_node.py b/ntfs_utils/db_node.py index c928bf2..284dbfb 100644 --- a/ntfs_utils/db_node.py +++ b/ntfs_utils/db_node.py @@ -4,6 +4,7 @@ import sqlite3 from datetime import datetime from ntfs_utils.mft_analyze import GetFile80hPattern, GetFragmentData, ExtractSequenceHexValues, hex_list_to_int +from ntfs_utils.main import volume_letter # 工具函数:获取文件扩展名 @@ -131,11 +132,10 @@ def GetFragmentLength(fragment): # 主函数:将 db_path 数据导入 db_node -def InsertNodeDataToDB(volume_letter: str, db_path='../src/db_ntfs_info.db', table_name='db_node', batch_size=20): +def InsertNodeDataToDB(db_path='../src/db_ntfs_info.db', table_name='db_node', batch_size=20): conn = sqlite3.connect(db_path) cursor = conn.cursor() - volume_letter = volume_letter.upper().strip() if len(volume_letter) == 1: volume_root = f"{volume_letter}:\\" elif volume_letter.endswith(':'): @@ -254,5 +254,4 @@ def InsertNodeDataToDB(volume_letter: str, db_path='../src/db_ntfs_info.db', tab if __name__ == "__main__": - volume_letter_test = "Y" - InsertNodeDataToDB(volume_letter=volume_letter_test) + InsertNodeDataToDB() diff --git a/ntfs_utils/db_path.py b/ntfs_utils/db_path.py index 26ac595..bb2f16f 100644 --- a/ntfs_utils/db_path.py +++ b/ntfs_utils/db_path.py @@ -23,7 +23,7 @@ def ShouldSkipPath(path: str) -> bool: return False -def ScanVolume(volume_letter: str): +def ScanVolume(volume_letter: str) -> list: """ 完整扫描指定磁盘的所有文件和目录,忽略 NTFS 元文件和系统文件夹, 并为每个节点分配 ParentID。 @@ -33,7 +33,6 @@ def ScanVolume(volume_letter: str): if not os.path.exists(root_path): raise ValueError(f"磁盘 {root_path} 不存在") - result = [] path_to_id = {} # 路径 -> ID 映射 counter = 1 @@ -61,15 +60,12 @@ def ScanVolume(volume_letter: str): name = entry - # 分离盘符并去除开头和结尾的 \ + # 分离盘符并处理路径格式 _, relative_path = os.path.splitdrive(full_path) relative_path = relative_path.lstrip("\\").rstrip("\\") - - # 如果是目录,结尾加 / if os.path.isdir(full_path) and not relative_path.endswith("/"): relative_path += "/" - # 替换所有 \ -> / relative_path = relative_path.replace("\\", "/") path_hash = GenerateHash(relative_path) @@ -97,25 +93,27 @@ def ScanVolume(volume_letter: str): "ContentSize": content_size } - result.append(item) + yield item # 使用 yield 返回每条记录 path_to_id[relative_path] = counter counter += 1 except Exception as e: print(f"⚠️ 跳过路径 {full_path},错误: {e}") - return result - -def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20): +def InsertPathDataToDB(data_generator, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20): """ - 批量将扫描结果写入数据库。 + 流式写入数据库,边扫描边入库。 + + :param data_generator: 可迭代对象(如生成器) + :param db_path: 数据库路径 + :param table_name: 表名 + :param batch_size: 每多少条记录提交一次 """ conn = sqlite3.connect(db_path) cursor = conn.cursor() try: - # 创建表(如果不存在) create_table_sql = f""" CREATE TABLE IF NOT EXISTS {table_name} ( ID INTEGER PRIMARY KEY AUTOINCREMENT, @@ -125,23 +123,20 @@ def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_pa IsDir INTEGER NOT NULL CHECK(IsDir IN (0, 1)), ParentID INTEGER, ContentSize INTEGER, - FOREIGN KEY(ParentID) REFERENCES {table_name}(ID) ); """ cursor.execute(create_table_sql) - # 插入语句(忽略重复 PathHash) insert_sql = f""" INSERT OR IGNORE INTO {table_name} (Path, Name, PathHash, IsDir, ParentID, ContentSize) VALUES (?, ?, ?, ?, ?, ?) """ - total_inserted = 0 batch = [] - for item in data: + for item in data_generator: batch.append(( item['Path'], item['Name'], @@ -154,39 +149,34 @@ def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_pa if len(batch) >= batch_size: cursor.executemany(insert_sql, batch) conn.commit() - total_inserted += cursor.rowcount print(f"✅ 提交一批 {len(batch)} 条数据") batch.clear() - # 插入剩余数据 + # 提交剩余不足一批的数据 if batch: cursor.executemany(insert_sql, batch) conn.commit() - total_inserted += cursor.rowcount print(f"✅ 提交最后一批 {len(batch)} 条数据") - print(f"✅ 总共插入 {total_inserted} 条记录到数据库。") - except Exception as e: print(f"❌ 插入失败: {e}") conn.rollback() - finally: conn.close() # 示例主函数 -def main(): - volume_letter = "Y" - +def DBPathMain(volume_letter: str): print(f"🔍 开始全盘扫描磁盘 {volume_letter}:\\ ...") - scanned_data = ScanVolume(volume_letter) - print(f"📊 共扫描到 {len(scanned_data)} 条有效记录,开始入库...") - InsertPathDataToDB(scanned_data) + # 获取生成器对象 + generator = ScanVolume(volume_letter) + + print(f"📊 开始逐批入库...") + InsertPathDataToDB(generator) print("✅ 全盘扫描与入库完成") if __name__ == "__main__": - main() + DBPathMain(volume_letter="Y") diff --git a/ntfs_utils/main.py b/ntfs_utils/main.py index 0fe468b..f25a6b7 100644 --- a/ntfs_utils/main.py +++ b/ntfs_utils/main.py @@ -2,8 +2,8 @@ from ntfs_utils.db_config import GetNTFSBootInfo, InsertInfoToDBConfig from ntfs_utils.db_device import ScanSpecialVolumes, InsertVolumesToDB from ntfs_utils.db_extend_name import InsertExtensionsToDB from ntfs_utils.db_group import InsertGroupToDB -from ntfs_utils.db_node import InsertNodeDataToDB -from ntfs_utils.db_path import ScanVolume, InsertPathDataToDB +# from ntfs_utils.db_node import InsertNodeDataToDB +from ntfs_utils.db_path import DBPathMain from ntfs_utils.db_user import InsertUserToDB volume_letter = 'Y' @@ -26,10 +26,6 @@ def main(): group_name_list = ["Copier"] InsertGroupToDB(group_name_list) - # 初始化 db_path 表 - scanned_data = ScanVolume(volume_letter) - InsertPathDataToDB(scanned_data) - # 初始化 db_extend_name 表 common_extensions = [ "txt", "log", "csv", "xls", "xlsx", "doc", "docx", @@ -41,8 +37,11 @@ def main(): count = InsertExtensionsToDB(common_extensions) print(f"共插入 {count} 个新扩展名。") + # 初始化 db_path 表 + DBPathMain(volume_letter=volume_letter) + # 初始化 db_node 表 - InsertNodeDataToDB(volume_letter) + # InsertNodeDataToDB(volume_letter) if __name__ == '__main__':