from sklearn.model_selection import train_test_split
# Part 1: Data preprocessing
path_to_file = "data/cmn.txt"  # Path to the dataset file.
# Step 7: define the preprocessing function.
def preprocess_eng(w):
w = w.lower().strip()
# Words are all lowercased and split on spaces.
# A space is inserted between words and punctuation.
# e.g. "he is a boy." => "he is a boy ."
# Reference: https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation