import xml.etree.ElementTree as ET import zipfile from pathlib import Path import gdown
def get_data(
    url: str,
    zip_path: str,
    raw_train_path: str,
    raw_test_path: str,
    processed_train_path: str,
    processed_test_path: str,
):
    """Download the zipped dataset, unpack it, and flatten train/test XML into text files.

    Args:
        url: Location handed to ``gdown.download``.
        zip_path: Where the downloaded archive is stored on disk.
        raw_train_path: Folder containing the raw training ``*.xml`` files.
        raw_test_path: Folder containing the raw test ``*.xml`` files.
        processed_train_path: Output file for the joined training text.
        processed_test_path: Output file for the joined test text.
    """
    # Download data from Google Drive. The caller-supplied ``zip_path`` is
    # honoured; it used to be overwritten by a hard-coded "Twitter.zip".
    gdown.download(url, zip_path, quiet=False)

    # Unzip data into the current working directory
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(".")

    # Extract both splits before writing anything, so a parsing failure does
    # not leave only one of the two output files behind.
    t_train_docs = _join_xml_texts(raw_train_path)
    t_test_docs = _join_xml_texts(raw_test_path)

    # Write processed data; UTF-8 is explicit so that non-ASCII text does not
    # depend on the platform's default encoding.
    with open(processed_train_path, "w", encoding="utf-8") as f:
        f.write(t_train_docs)
    with open(processed_test_path, "w", encoding="utf-8") as f:
        f.write(t_test_docs)


def _join_xml_texts(folder_path: str) -> str:
    """Return the space-joined text of every ``*.xml`` file in ``folder_path``."""
    docs = []
    for file_path in Path(folder_path).glob("*.xml"):
        # The texts live on the children of the root's first child element.
        nodes = ET.parse(file_path).getroot()[0]
        # Elements without text report ``None``; skip them so join() cannot fail.
        docs.append(" ".join(node.text for node in nodes if node.text is not None))
    return " ".join(docs)
def get_raw_data(url: str, zip_path: str) -> None:
    """Fetch the archive from ``url`` into ``zip_path`` and unpack it in place.

    The download goes through ``gdown.download``; the archive contents are
    extracted into the current working directory.
    """
    gdown.download(url, zip_path, quiet=False)

    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(".")
函数 get_raw_data 只执行一个动作,那就是获取原始数据。
重复性
我们应该避免重复,因为:
重复的代码削弱了代码的可读性。
重复的代码使修改更加复杂:一旦需要改动,就必须在多处同步修改,从而增加了出错的可能性。
下面的代码包含重复的内容:用于提取训练数据和测试数据的代码几乎完全相同。
from pathlib import Path
# Extract texts from the files in the train directory
t_train = []
for file_path in Path(raw_train_path).glob("*.xml"):
    train_nodes = ET.parse(file_path).getroot()[0]
    t_train.append(" ".join(node.text for node in train_nodes))
t_train_docs = " ".join(t_train)

# Extract texts from the files in the test directory
t_test = []
for file_path in Path(raw_test_path).glob("*.xml"):
    test_nodes = ET.parse(file_path).getroot()[0]
    t_test.append(" ".join(node.text for node in test_nodes))
t_test_docs = " ".join(t_test)
# Gather the space-joined text of each XML file found in the folder
all_docs = []
for file_path in Path(folder_path).glob("*.xml"):
    document_nodes = ET.parse(file_path).getroot()[0]
    all_docs.append(" ".join([node.text for node in document_nodes]))
# NOTE(review): the `return` below implies this is a function body, but the
# enclosing `def` line is not part of this excerpt.
# Accumulates one joined string per XML file found in `folder_path`.
all_docs = []
for file_path in Path(folder_path).glob("*.xml"):
    # Texts are read from the children of the root's first child element.
    list_of_text_in_one_file = [r.text for r in ET.parse(file_path).getroot()[0]]
    text_in_one_file = " ".join(list_of_text_in_one_file)
    all_docs.append(text_in_one_file)

# Merge all per-file strings into a single space-separated string.
return " ".join(all_docs)
该函数本身处于较高的抽象层次,但 for 循环内的代码涉及 XML 解析、文本提取和字符串操作等较低层次的操作。
def extract_texts_from_multiple_files(folder_path: str) -> str:
    """Return the text of every ``*.xml`` file in ``folder_path`` as one string.

    Each file is handed to ``extract_texts_from_each_file``; the per-file
    results are joined with single spaces.
    """
    xml_files = Path(folder_path).glob("*.xml")
    return " ".join(
        [extract_texts_from_each_file(xml_file) for xml_file in xml_files]
    )
def extract_texts_from_each_file(file_path: str) -> str:
    """Return the space-joined texts stored in a single XML file.

    The texts are taken from the children of the root's first child element.
    Children that carry no text are skipped.

    Args:
        file_path: Path of the XML file; passed straight to ``ET.parse``.

    Returns:
        The children's texts joined by single spaces.
    """
    nodes = ET.parse(file_path).getroot()[0]
    # An element without text reports ``None``, which would make join() raise
    # TypeError; leave such elements out instead.
    return " ".join(node.text for node in nodes if node.text is not None)