# Plain text  |  62 lines  |  1.99 KB

import re,string


class reason_counter:
    """Occurrence tally for one reason, paired with the text to display.

    A new counter starts at one occurrence.  Every ``update`` call adds one
    more occurrence and replaces the stored wording with the one supplied.
    """

    def __init__(self, wording):
        # A counter is only created when a reason is first seen,
        # so the tally begins at 1 rather than 0.
        self.num = 1
        self.wording = wording

    def update(self, new_wording):
        """Count one more occurrence and swap in ``new_wording``."""
        self.wording = new_wording
        self.num += 1

    def html(self):
        """Return the display text for this reason.

        With a count of exactly 1 the wording is returned as is; otherwise
        the count is appended in the form ``"<wording> (<count>+)"``.
        """
        single = self.num == 1
        return self.wording if single else "%s (%d+)" % (self.wording, self.num)


def numbers_are_irrelevant(txt):
    """Tell whether numbers inside the reason ``txt`` may be replaced by NN.

    Currently the answer is always True, whatever ``txt`` is.  If/when some
    categories of reasons need to keep their numbers, this function shall
    return False for those categories.
    """
    return True


def aggregate_reason_fields(reasons_list):
    """Collapse a list of reason strings into counted, generalized entries.

    Each item of ``reasons_list`` may itself hold several reasons separated
    by ``|``.  All of them are expanded into one flat sequence, then grouped
    by a "reduced" form: surrounding whitespace stripped, inner whitespace
    runs collapsed to a single space and, when ``numbers_are_irrelevant``
    says so, every run of digits replaced by ``NN``.  Empty reasons are
    ignored.

    Returns a list with one display string per distinct reduced reason, as
    produced by ``reason_counter.html()``, ordered from the most frequent
    to the least frequent.  The sort is stable, so reasons with equal
    counts keep the iteration order of the underlying dict.
    """
    # each reason in the list may be a combination
    # of | - separated reasons; expand into one flat list
    reasons = '|'.join(reasons_list).split('|')

    reason_htable = {}
    for reason in reasons:
        # strip the ends and reduce inner whitespace runs to one space
        reason_reduced = re.sub(r"\s+", " ", reason.strip())

        if reason_reduced == '':
            continue  # ignore empty reasons

        if numbers_are_irrelevant(reason_reduced):
            # reduce numbers included into the reason descriptor
            # by replacing them with the generic NN
            reason_reduced = re.sub(r"\d+", "NN", reason_reduced)

        if reason_reduced not in reason_htable:
            # reason_counter keeps the original (non reduced) reason
            # as long as it occurred only once
            reason_htable[reason_reduced] = reason_counter(reason)
        else:
            # if the reason occurred more than once, reason_counter
            # keeps it in reduced/generalized form
            reason_htable[reason_reduced].update(reason_reduced)

    # sorted() accepts any iterable, unlike calling .sort() on dict.keys(),
    # which only works where keys() returns a real list (Python 2).
    generic_reasons = sorted(reason_htable,
                             key=lambda k: reason_htable[k].num,
                             reverse=True)
    # Build a real list so callers get the same type on Python 2 and 3
    # (map() is lazy on Python 3).
    return [reason_htable[generic_reason].html()
            for generic_reason in generic_reasons]