optimize db_path memory

This commit is contained in:
Burgess Leo
2025-05-23 18:01:42 +08:00
parent d4a411ce68
commit 491685e892
5 changed files with 48 additions and 46 deletions

View File

@@ -25,10 +25,10 @@ def ClearTableRecordsWithReset(db_path, table_name):
if __name__ == '__main__': if __name__ == '__main__':
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_path') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_path')
ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_node') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_node')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_device') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_device')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_config') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_config')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_user') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_user')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_group') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_group')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_extent') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_extent')
# ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_name') ClearTableRecordsWithReset(db_path='../src/db_ntfs_info.db', table_name='db_extend_name')

View File

@@ -0,0 +1,14 @@
# Ad-hoc helper script: open the NTFS-info SQLite database and print the
# first device ID found in the db_device table.
import sqlite3
# Path to the SQLite database, relative to this script's working directory.
db_path = "../src/db_ntfs_info.db"
# NOTE(review): sqlite3.connect creates an empty file if db_path does not
# exist — confirm the database is expected to be present before this runs.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
def GetDeviceId(cursor: sqlite3.Cursor) -> int:
    """Return the lowest ID stored in the db_device table.

    :param cursor: open cursor on a database that contains a db_device table
    :return: the smallest ID value, or 0 when the table has no rows
    """
    # execute() returns the cursor itself, so the fetch can be chained.
    row = cursor.execute("SELECT ID FROM db_device ORDER BY ID LIMIT 1").fetchone()
    if row is None:
        # Empty table: fall back to 0, matching the original contract.
        return 0
    return row[0]
# Script output: the first device ID (0 indicates db_device is empty).
print(GetDeviceId(cursor))

View File

@@ -4,6 +4,7 @@ import sqlite3
from datetime import datetime from datetime import datetime
from ntfs_utils.mft_analyze import GetFile80hPattern, GetFragmentData, ExtractSequenceHexValues, hex_list_to_int from ntfs_utils.mft_analyze import GetFile80hPattern, GetFragmentData, ExtractSequenceHexValues, hex_list_to_int
from ntfs_utils.main import volume_letter
# 工具函数:获取文件扩展名 # 工具函数:获取文件扩展名
@@ -131,11 +132,10 @@ def GetFragmentLength(fragment):
# 主函数:将 db_path 数据导入 db_node # 主函数:将 db_path 数据导入 db_node
def InsertNodeDataToDB(volume_letter: str, db_path='../src/db_ntfs_info.db', table_name='db_node', batch_size=20): def InsertNodeDataToDB(db_path='../src/db_ntfs_info.db', table_name='db_node', batch_size=20):
conn = sqlite3.connect(db_path) conn = sqlite3.connect(db_path)
cursor = conn.cursor() cursor = conn.cursor()
volume_letter = volume_letter.upper().strip()
if len(volume_letter) == 1: if len(volume_letter) == 1:
volume_root = f"{volume_letter}:\\" volume_root = f"{volume_letter}:\\"
elif volume_letter.endswith(':'): elif volume_letter.endswith(':'):
@@ -254,5 +254,4 @@ def InsertNodeDataToDB(volume_letter: str, db_path='../src/db_ntfs_info.db', tab
if __name__ == "__main__": if __name__ == "__main__":
volume_letter_test = "Y" InsertNodeDataToDB()
InsertNodeDataToDB(volume_letter=volume_letter_test)

View File

@@ -23,7 +23,7 @@ def ShouldSkipPath(path: str) -> bool:
return False return False
def ScanVolume(volume_letter: str): def ScanVolume(volume_letter: str) -> list:
""" """
完整扫描指定磁盘的所有文件和目录,忽略 NTFS 元文件和系统文件夹, 完整扫描指定磁盘的所有文件和目录,忽略 NTFS 元文件和系统文件夹,
并为每个节点分配 ParentID。 并为每个节点分配 ParentID。
@@ -33,7 +33,6 @@ def ScanVolume(volume_letter: str):
if not os.path.exists(root_path): if not os.path.exists(root_path):
raise ValueError(f"磁盘 {root_path} 不存在") raise ValueError(f"磁盘 {root_path} 不存在")
result = []
path_to_id = {} # 路径 -> ID 映射 path_to_id = {} # 路径 -> ID 映射
counter = 1 counter = 1
@@ -61,15 +60,12 @@ def ScanVolume(volume_letter: str):
name = entry name = entry
# 分离盘符并去除开头和结尾的 \ # 分离盘符并处理路径格式
_, relative_path = os.path.splitdrive(full_path) _, relative_path = os.path.splitdrive(full_path)
relative_path = relative_path.lstrip("\\").rstrip("\\") relative_path = relative_path.lstrip("\\").rstrip("\\")
# 如果是目录,结尾加 /
if os.path.isdir(full_path) and not relative_path.endswith("/"): if os.path.isdir(full_path) and not relative_path.endswith("/"):
relative_path += "/" relative_path += "/"
# 替换所有 \ -> /
relative_path = relative_path.replace("\\", "/") relative_path = relative_path.replace("\\", "/")
path_hash = GenerateHash(relative_path) path_hash = GenerateHash(relative_path)
@@ -97,25 +93,27 @@ def ScanVolume(volume_letter: str):
"ContentSize": content_size "ContentSize": content_size
} }
result.append(item) yield item # 使用 yield 返回每条记录
path_to_id[relative_path] = counter path_to_id[relative_path] = counter
counter += 1 counter += 1
except Exception as e: except Exception as e:
print(f"⚠️ 跳过路径 {full_path},错误: {e}") print(f"⚠️ 跳过路径 {full_path},错误: {e}")
return result
def InsertPathDataToDB(data_generator, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20):
def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_path', batch_size=20):
""" """
批量将扫描结果写入数据库。 流式写入数据库,边扫描边入库。
:param data_generator: 可迭代对象(如生成器)
:param db_path: 数据库路径
:param table_name: 表名
:param batch_size: 每多少条记录提交一次
""" """
conn = sqlite3.connect(db_path) conn = sqlite3.connect(db_path)
cursor = conn.cursor() cursor = conn.cursor()
try: try:
# 创建表(如果不存在)
create_table_sql = f""" create_table_sql = f"""
CREATE TABLE IF NOT EXISTS {table_name} ( CREATE TABLE IF NOT EXISTS {table_name} (
ID INTEGER PRIMARY KEY AUTOINCREMENT, ID INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -125,23 +123,20 @@ def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_pa
IsDir INTEGER NOT NULL CHECK(IsDir IN (0, 1)), IsDir INTEGER NOT NULL CHECK(IsDir IN (0, 1)),
ParentID INTEGER, ParentID INTEGER,
ContentSize INTEGER, ContentSize INTEGER,
FOREIGN KEY(ParentID) REFERENCES {table_name}(ID) FOREIGN KEY(ParentID) REFERENCES {table_name}(ID)
); );
""" """
cursor.execute(create_table_sql) cursor.execute(create_table_sql)
# 插入语句(忽略重复 PathHash
insert_sql = f""" insert_sql = f"""
INSERT OR IGNORE INTO {table_name} INSERT OR IGNORE INTO {table_name}
(Path, Name, PathHash, IsDir, ParentID, ContentSize) (Path, Name, PathHash, IsDir, ParentID, ContentSize)
VALUES (?, ?, ?, ?, ?, ?) VALUES (?, ?, ?, ?, ?, ?)
""" """
total_inserted = 0
batch = [] batch = []
for item in data: for item in data_generator:
batch.append(( batch.append((
item['Path'], item['Path'],
item['Name'], item['Name'],
@@ -154,39 +149,34 @@ def InsertPathDataToDB(data, db_path='../src/db_ntfs_info.db', table_name='db_pa
if len(batch) >= batch_size: if len(batch) >= batch_size:
cursor.executemany(insert_sql, batch) cursor.executemany(insert_sql, batch)
conn.commit() conn.commit()
total_inserted += cursor.rowcount
print(f"✅ 提交一批 {len(batch)} 条数据") print(f"✅ 提交一批 {len(batch)} 条数据")
batch.clear() batch.clear()
# 插入剩余数据 # 提交剩余不足一批的数据
if batch: if batch:
cursor.executemany(insert_sql, batch) cursor.executemany(insert_sql, batch)
conn.commit() conn.commit()
total_inserted += cursor.rowcount
print(f"✅ 提交最后一批 {len(batch)} 条数据") print(f"✅ 提交最后一批 {len(batch)} 条数据")
print(f"✅ 总共插入 {total_inserted} 条记录到数据库。")
except Exception as e: except Exception as e:
print(f"❌ 插入失败: {e}") print(f"❌ 插入失败: {e}")
conn.rollback() conn.rollback()
finally: finally:
conn.close() conn.close()
# 示例主函数 # 示例主函数
def main(): def DBPathMain(volume_letter: str):
volume_letter = "Y"
print(f"🔍 开始全盘扫描磁盘 {volume_letter}:\\ ...") print(f"🔍 开始全盘扫描磁盘 {volume_letter}:\\ ...")
scanned_data = ScanVolume(volume_letter)
print(f"📊 共扫描到 {len(scanned_data)} 条有效记录,开始入库...") # 获取生成器对象
InsertPathDataToDB(scanned_data) generator = ScanVolume(volume_letter)
print(f"📊 开始逐批入库...")
InsertPathDataToDB(generator)
print("✅ 全盘扫描与入库完成") print("✅ 全盘扫描与入库完成")
if __name__ == "__main__": if __name__ == "__main__":
main() DBPathMain(volume_letter="Y")

View File

@@ -2,8 +2,8 @@ from ntfs_utils.db_config import GetNTFSBootInfo, InsertInfoToDBConfig
from ntfs_utils.db_device import ScanSpecialVolumes, InsertVolumesToDB from ntfs_utils.db_device import ScanSpecialVolumes, InsertVolumesToDB
from ntfs_utils.db_extend_name import InsertExtensionsToDB from ntfs_utils.db_extend_name import InsertExtensionsToDB
from ntfs_utils.db_group import InsertGroupToDB from ntfs_utils.db_group import InsertGroupToDB
from ntfs_utils.db_node import InsertNodeDataToDB # from ntfs_utils.db_node import InsertNodeDataToDB
from ntfs_utils.db_path import ScanVolume, InsertPathDataToDB from ntfs_utils.db_path import DBPathMain
from ntfs_utils.db_user import InsertUserToDB from ntfs_utils.db_user import InsertUserToDB
volume_letter = 'Y' volume_letter = 'Y'
@@ -26,10 +26,6 @@ def main():
group_name_list = ["Copier"] group_name_list = ["Copier"]
InsertGroupToDB(group_name_list) InsertGroupToDB(group_name_list)
# 初始化 db_path 表
scanned_data = ScanVolume(volume_letter)
InsertPathDataToDB(scanned_data)
# 初始化 db_extend_name 表 # 初始化 db_extend_name 表
common_extensions = [ common_extensions = [
"txt", "log", "csv", "xls", "xlsx", "doc", "docx", "txt", "log", "csv", "xls", "xlsx", "doc", "docx",
@@ -41,8 +37,11 @@ def main():
count = InsertExtensionsToDB(common_extensions) count = InsertExtensionsToDB(common_extensions)
print(f"共插入 {count} 个新扩展名。") print(f"共插入 {count} 个新扩展名。")
# 初始化 db_path 表
DBPathMain(volume_letter=volume_letter)
# 初始化 db_node 表 # 初始化 db_node 表
InsertNodeDataToDB(volume_letter) # InsertNodeDataToDB(volume_letter)
if __name__ == '__main__': if __name__ == '__main__':