【python练习】正则表达式练习

import re
def process(input_data):
    """
    Extract IMSI/MSISDN pairs from subscriber records that qualify.

    The input file is a sequence of records, each delimited by a
    ``<SUBBEGIN`` line and a ``<SUBEND`` line, for example::

        <SUBBEGIN
            IMSI=1243560615528273;
            MSISDN=986768559232;
            VLRLIST=10;
            OPTGPRS=3-33-504-241-33-NONE-0-NONE-00000000-65535-0-0-PS_APN-NONE-65535-1;
            OPTGPRS=1-0-504-241-33-NONE-0-NONE-00000000-65535-0-0-PS_APN-NONE-65535-1;
            CHARGE_GLOBAL=3;
        <SUBEND

    A record qualifies when it contains at least one ``OPTGPRS`` entry whose
    second field is exactly ``33`` and at least one whose second field is
    exactly ``0``.

    :param input_data: path of the subscriber information file
    :return: list of strings, one per qualifying record, formatted like
        ``IMSI=1243560615528273;MSISDN=986768559232``
    """
    # The negative lookahead stops "33" from matching "330" and "0" from
    # matching "05"; ``\d+`` accepts multi-digit values in the first field.
    apn_33_re = re.compile(r'OPTGPRS=\d+-33(?!\d)')
    apn_0_re = re.compile(r'OPTGPRS=\d+-0(?!\d)')
    number_re = re.compile(r'IMSI=.+MSISDN=\d+')

    with open(input_data) as f:
        # Drop the leading tab and trailing newline of every line.
        lines = [raw.strip() for raw in f]

    result = []
    record = None  # None while we are outside a <SUBBEGIN ... <SUBEND pair
    for line in lines:
        if line == "<SUBBEGIN":
            record = []
        elif line == "<SUBEND":
            if record is not None:
                # Fields are joined without a separator, so the IMSI and
                # MSISDN lines become "IMSI=...;MSISDN=...;" back to back.
                text = ''.join(record)
                if apn_33_re.search(text) and apn_0_re.search(text):
                    found = number_re.search(text)
                    # Skip records without an IMSI/MSISDN pair instead of
                    # reporting an empty string for them.
                    if found:
                        result.append(found.group(0))
            record = None
        elif record is not None:
            record.append(line)
    return result


if __name__ == '__main__':
    process('input_data.txt')

2、代码优化通过类实现

import re

class apnInfoFinder():
    """
    Find subscriber records with both required OPTGPRS entries.

    A record is the run of lines between a ``<SUBBEGIN`` line and the
    following ``<SUBEND`` line. It qualifies when it has at least one
    ``OPTGPRS`` entry whose second field is exactly ``33`` and at least one
    whose second field is exactly ``0``. For each qualifying record the
    ``IMSI=...;MSISDN=...`` text is collected, and ``outPut`` writes the
    collected strings to ``output.csv``.
    """

    # The negative lookahead stops "33" from matching "330" and "0" from
    # matching "05"; ``\d+`` accepts multi-digit values in the first field.
    _APN_33_RE = re.compile(r'OPTGPRS=\d+-33(?!\d)')
    _APN_0_RE = re.compile(r'OPTGPRS=\d+-0(?!\d)')
    _NUMBER_RE = re.compile(r'IMSI=.+MSISDN=\d+')

    def __init__(self, input_data):
        """
        :param input_data: path of the subscriber information file
        """
        self.file = input_data
        self.msglist = []      # stripped lines of the input file
        self.start_index = []  # indexes of "<SUBBEGIN" lines in msglist
        self.stop_index = []   # indexes of "<SUBEND" lines in msglist
        self.result = []       # "IMSI=...;MSISDN=..." strings that matched

    # Step 1: read the file into a list of stripped lines.
    def getMsgList(self):
        """
        Read ``self.file`` and store its lines without surrounding whitespace.

        The list is rebuilt on every call, so calling this twice does not
        duplicate the lines.

        :return: ``self.msglist``, e.g.
            ``['<SUBBEGIN', 'IMSI=1243560615528273;', 'MSISDN=986768559232;', ...]``
        """
        with open(self.file) as f:
            self.msglist = [raw.strip() for raw in f]
        return self.msglist

    # Step 2: locate the record delimiters.
    def getNewList(self, list):
        """
        Record the positions of the ``<SUBBEGIN`` and ``<SUBEND`` lines.

        :param list: stripped lines, as returned by ``getMsgList``
        :return: tuple ``(self.start_index, self.stop_index)``
        """
        self.start_index = [i for i, line in enumerate(list) if line == "<SUBBEGIN"]
        self.stop_index = [i for i, line in enumerate(list) if line == "<SUBEND"]
        return self.start_index, self.stop_index

    # Step 3: rebuild each record as one string and match it.
    def getFinder(self, lenlist, list):
        """
        Collect the IMSI/MSISDN text of every qualifying record.

        :param lenlist: sequence whose length is the number of records to
            examine (``outPut`` passes ``self.start_index``)
        :param list: stripped lines, as returned by ``getMsgList``
        :return: ``self.result``, e.g.
            ``['IMSI=1243560615528273;MSISDN=986768559232']``
        """
        self.result = []
        # Never index past the shorter of the two delimiter lists, so a file
        # with a missing <SUBEND does not raise IndexError.
        count = min(len(lenlist), len(self.start_index), len(self.stop_index))
        for i in range(count):
            # Fields are joined without a separator, so the IMSI and MSISDN
            # lines become "IMSI=...;MSISDN=...;" back to back.
            text = ''.join(list[self.start_index[i]:self.stop_index[i]])
            if self._APN_33_RE.search(text) and self._APN_0_RE.search(text):
                found = self._NUMBER_RE.search(text)
                # Skip records without an IMSI/MSISDN pair instead of
                # reporting an empty string for them.
                if found:
                    self.result.append(found.group(0))
        return self.result

    # Step 4: run the whole pipeline and write the results to output.csv.
    def outPut(self):
        """
        Run steps 1-3 and write one result per line to ``output.csv``.

        The file is created in the current working directory and overwritten
        if it already exists.
        """
        self.getMsgList()
        self.getNewList(self.msglist)
        self.getFinder(self.start_index, self.msglist)
        with open('output.csv', 'w') as out_result:
            out_result.writelines(line + '\n' for line in self.result)

if __name__ == '__main__':
    # Script entry point: process input_data.txt and write the results file.
    finder = apnInfoFinder('input_data.txt')
    finder.outPut()
原文地址:https://www.cnblogs.com/zhaoyujiao/p/15429481.html