Python正则表达式

img


读取 ge_wordlist.txt 文件,将含有非字母非数字字符的单词抽取出来,保存到一个文件中,将其他单词保存到另一个文件中。
另外,将含有连续两个元音字母的单词抽取出来,保存到一个单独的文件中。

基于 New Bing 的编写参考:
代码中使用 re 模块的正则表达式功能匹配单词,根据题目要求将不同类型的单词写入到不同的文件中。其中,non_alpha_numeric_pattern 匹配非字母非数字的单词,double_vowel_pattern 匹配连续两个元音字母的单词。需要注意的是,在打开文件时需指定所使用的编码方式(这里使用 utf-8)。


import re

input_file_path = "ge_wordlist.txt"
non_alpha_numeric_file_path = "non_alpha_numeric_words.txt"
other_words_file_path = "other_words.txt"
double_vowel_file_path = "double_vowel_words.txt"

# Matches a single character that is neither a letter nor a digit.
# ``\W`` excludes the underscore (it counts as a "word" character), so the
# underscore is added explicitly.  Used with ``search`` so that such a
# character is found anywhere in the word, not only at its edges.
non_alpha_numeric_pattern = re.compile(r"[\W_]")

# Matches two consecutive vowel letters anywhere in a word.
double_vowel_pattern = re.compile(r"[aeiouAEIOU]{2}")


def classify_word(word: str):
    """Classify a single word against both patterns.

    Args:
        word: The word to inspect (already stripped of surrounding
            whitespace).

    Returns:
        A ``(has_non_alnum, has_double_vowel)`` tuple of booleans:
        whether the word contains at least one character that is neither
        a letter nor a digit, and whether it contains two consecutive
        vowels.
    """
    return (
        non_alpha_numeric_pattern.search(word) is not None,
        double_vowel_pattern.search(word) is not None,
    )


def split_word_list(
    input_path: str = input_file_path,
    non_alpha_numeric_path: str = non_alpha_numeric_file_path,
    other_words_path: str = other_words_file_path,
    double_vowel_path: str = double_vowel_file_path,
) -> None:
    """Split a one-word-per-line list into the three output files.

    Every non-blank line is treated as one word (surrounding whitespace
    removed) and written unchanged, followed by a newline:

    * words containing a non-letter, non-digit character go to
      ``non_alpha_numeric_path``; all remaining words go to
      ``other_words_path``;
    * independently of that split, every word containing two consecutive
      vowels is also written to ``double_vowel_path``.

    All files are opened as UTF-8; the output files are overwritten.

    Args:
        input_path: Word list to read.
        non_alpha_numeric_path: Output for words with a non-alphanumeric
            character.
        other_words_path: Output for purely alphanumeric words.
        double_vowel_path: Output for words with two consecutive vowels.

    Raises:
        OSError: If any of the files cannot be opened.
    """
    with open(input_path, mode="r", encoding="utf-8") as input_file, \
            open(non_alpha_numeric_path, mode="w", encoding="utf-8") as non_alpha_numeric_file, \
            open(other_words_path, mode="w", encoding="utf-8") as other_words_file, \
            open(double_vowel_path, mode="w", encoding="utf-8") as double_vowel_file:
        for line in input_file:
            word = line.strip()
            if not word:
                # Blank lines carry no word; skip them.
                continue

            has_non_alnum, has_double_vowel = classify_word(word)

            # Two-way split required by the task: "special" words vs. the rest.
            if has_non_alnum:
                non_alpha_numeric_file.write(word + "\n")
            else:
                other_words_file.write(word + "\n")

            # Separate extraction; deliberately not an ``elif`` so that a
            # double-vowel word still appears in its file from the split above.
            if has_double_vowel:
                double_vowel_file.write(word + "\n")


if __name__ == "__main__":
    split_word_list()