2022年 11月 7日

python读取文本中的英文_验证英文文本Python中“a”和“an”的正确使用

这里有一个解决方案,其中“正确”的定义是:an 出现在以元音发音开头的单词之前,其他情况下使用 a:

#!/usr/bin/env python

import itertools

import re

import sys

try:
    # Python 2.6/2.7: prefer the iterator-returning versions of map/zip.
    from future_builtins import map, zip
except ImportError:  # Python 3 (or old Python versions)
    # Fall back to the builtins. The self-assignment only binds them as
    # module-level names, so both branches leave `map` and `zip` defined here.
    map, zip = map, zip

from operator import methodcaller

import nltk # $ pip install nltk

from nltk.corpus import cmudict # >>> nltk.download(‘cmudict’)

def starts_with_vowel_sound(word, pronunciations=cmudict.dict()):
    """Tell whether *word* begins with a vowel sound.

    Only the first pronunciation listed for *word* is consulted. Its first
    phoneme counts as a vowel when the phoneme's last character is a digit
    (CMUdict marks vowel phonemes with a trailing stress digit, e.g. ``AH0``).

    Args:
        word: The word to look up, spelled the way the keys of
            *pronunciations* are spelled.
        pronunciations: Mapping from a word to its pronunciations, each one a
            sequence of phoneme strings. The default, ``cmudict.dict()``, is
            evaluated once, when the function is defined.

    Returns:
        ``True`` or ``False`` for a word that has at least one pronunciation;
        ``None`` when the word is missing or has no pronunciations.
    """
    entries = pronunciations.get(word, [])
    for phonemes in entries:
        # Use only the first pronunciation: return on the first iteration.
        first_phoneme = phonemes[0]
        return first_phoneme[-1].isdigit()
    # No pronunciation available: the answer is unknown (falsy for callers).
    return None

def check_a_an_usage(words):
    """Check every indefinite article in *words* against the word after it.

    The tokens are lower-cased and walked as overlapping pairs. A pair is
    reported when its first token is ``a`` or ``an`` and its second token
    consists of word characters only, so punctuation after an article is
    skipped. ``an`` is expected when ``starts_with_vowel_sound`` is truthy for
    the following word, ``a`` otherwise.

    Args:
        words: Iterable of string tokens, in text order.

    Yields:
        ``(valid, article, word)`` tuples, where ``article`` and ``word`` are
        the lower-cased tokens and ``valid`` is ``True`` when the article
        matches the expected one.
    """
    # iterate over words pairwise (recipe from itertools)
    # note: ignore Unicode case-folding (`.casefold()`)
    current, following = itertools.tee(map(methodcaller('lower'), words))
    next(following, None)  # shift the second iterator one token ahead

    is_word = re.compile(r'\w+$').match
    for article, word in zip(current, following):
        if article in ('a', 'an') and is_word(word):
            expected = 'an' if starts_with_vowel_sound(word) else 'a'
            yield article == expected, article, word

# note: you could use nltk to split text in paragraphs, sentences, words
# Collect every (article, word) pair whose article is wrong. Each non-blank
# line of stdin is tokenized and checked separately, so a pair never spans
# two input lines.
pairs = ((a, w)
         for sentence in sys.stdin.readlines() if sentence.strip()
         for valid, a, w in check_a_an_usage(nltk.wordpunct_tokenize(sentence))
         if not valid)

# Print a header, then one "article word" pair per line.
print("Invalid indefinite article usage:")
print('\n'.join(map(" ".join, pairs)))

示例输入(每行一句话)

^{pr2}$

输出:

Invalid indefinite article usage:

a acre

an rhythm

an yearly