数据分析进阶 数据质量

目的:使用python对excel中的数据进行清洗,根据列进行数据清洗

习题1:

  处理城市 infobox 数据,对数据进行审核,然后想出清理方法并清理数据。在第一道练习中,请审核数据集中某些特定字段中的数据类型。
  值类型可以是:
  1. NoneType,如果值是字符串“NULL”或空字符串“”
  2. 列表,如果值以“{”开头
  3. 整型,如果值可以转型为整型
  4. 浮点型,如果值可以转型为浮点型,但是无法转型为整型。
    例如,“3.23e+07”应该被当做浮点型,因为可以转型为浮点型,但是int('3.23e+07') 将抛出 ValueError
  5. “str”,表示其他所有值  

  

import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]
 
def get_type(value):
#条件1,如果是NULL或者''返回None类型
if value == 'NULL' or value == '': return type(None)
#条件2,如果以{开头,返回list()类型
if value.startswith('{'): return type(list())
#条件3,如果是整形,返回int类型
try: new_value = int(value) return type(new_value) except ValueError: pass
  #条件4,如果是浮点型,返回float类型
try: new_value = float(value) return type(new_value) except ValueError: pass
#条件5,都不满足以上条件,返回str类型
return type(str()) def audit_file(filename,fields):
#存放每一列的类型 fieldtypes
= {} with open(filename,'r') as data_file: reader = csv.DictReader(data_file)
#跳过文件中的前三行
for i in range(3): reader.next() #将给定的列名变成key,value生成一个空set来存放类型 for field in FIELDS: fieldtypes[field] = set() for row in reader:
#循环每一行
for k,v in row.items():
#获取数据的类型 data_type
= get_type(v)
#如果row中的表头在给定的表头定义中
if k in FIELDS:
#如果数据的类型不在字典中,则将该类型加入到字典,并返回
if data_type not in fieldtypes[k]: fieldtypes[k].add(data_type) return fieldtypes
def test():
    #测试函数,不报错即为代码正确
    fieldtypes = audit_file(CITIES, FIELDS)

    pprint.pprint(fieldtypes)

    assert fieldtypes["areaLand"] == set([type(1.1), type([]), type(None)])
    assert fieldtypes['areaMetro'] == set([type(1.1), type(None)])
    
if __name__ == "__main__":
    test()

习题2:

  在选择城市面积的值的时候,应该选择数据的精度为第一参考目标

习题3:

  根据上面的结论,修改表格中的areaLand字段,使其符合条件

  

def fix_area(area):
#如果以{开头,表示该字段有多个值,因此要选取精度最高的值返回即可
if area.startswith('{'):
#去掉前后的{},并且以|分割数据 area_land
= area.replace('{','').replace('}','').split('|')
#获取到两个值 area0
= area_land[0] area1 = area_land[1]
#根据数据的长度来判断精度,长的数据精度更高
if len(area0) > len(area1): return float(area0) else: return float(area1)
#表示该字段只有一个值或者没有值
else: try:
#如果有值,直接转换成浮点数,返回即可 area_land
= float(area) return area_land except ValueError:
#如果不存在,返回None area_land
= None return area_land def process_file(filename): # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE data = [] with open(filename, "r") as f: reader = csv.DictReader(f) #skipping the extra metadata for i in range(3): l = reader.next() # processing file for line in reader: # calling your function to fix the area value if "areaLand" in line: line["areaLand"] = fix_area(line["areaLand"]) data.append(line) return data
def test():
   #测试函数,不报错即为代码正确 data
= process_file(CITIES) print "Printing three example results:" for n in range(5,8): pprint.pprint(data[n]["areaLand"]) assert data[3]["areaLand"] == None assert data[8]["areaLand"] == 55166700.0 assert data[20]["areaLand"] == 14581600.0 assert data[33]["areaLand"] == 20564500.0 if __name__ == "__main__": test()

习题4:

  根据name列,返回所有名称列表。如果只有一个名称,列表将只有一项。如果名称是“NULL”,该列表应该为空

def fix_name(name):
#如果为NULL返回空列表
if name == 'NULL': name = []
#如果以{开头则有多个数据,根据|进行分割,并返回名字的列表
elif name.startswith('{'): name = name.replace('{','').replace('}','').split('|')
#如果有一个值,则返回单元素的列表
else: name = [name] return name def process_file(filename): data = [] with open(filename, "r") as f: reader = csv.DictReader(f) #skipping the extra metadata for i in range(3): l = reader.next() # processing file for line in reader: # calling your function to fix the area value if "name" in line: line["name"] = fix_name(line["name"]) data.append(line) return data
def test():
  #测试函数,不报错即为代码正确 data
= process_file(CITIES) print "Printing 20 results:" for n in range(20): pprint.pprint(data[n]["name"]) assert data[14]["name"] == ['Negtemiut', 'Nightmute'] assert data[9]["name"] == ['Pell City Alabama'] assert data[3]["name"] == ['Kumhari'] if __name__ == "__main__": test()

习题5:

  根据传入的point来分割出经度和纬度,和输入的参数的值进行比较,相同返回True,不相同返回False

def check_loc(point,lat,longi):
#按照空格进行分割 point_list
= point.split(' ') _lat = point_list[0] #经度 _longi = point_list[1] #纬度
#如果文件中的值和传入的值相同,返回True,否则返回False
if _lat == lat and _longi == longi: return True else: return False def process_file(filename): data = [] with open(filename, "r") as f: reader = csv.DictReader(f) #skipping the extra matadata for i in range(3): l = reader.next() # processing file for line in reader: # calling your function to check the location result = check_loc(line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"]) if not result: print "{}: {} != {} {}".format(line["name"], line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"]) data.append(line) return data
def test():
#测试函数,不报错即为代码正确
assert check_loc("33.08 75.28", "33.08", "75.28") == True assert check_loc("44.57833333333333 -91.21833333333333", "44.5783", "-91.2183") == False if __name__ == "__main__": test()
原文地址:https://www.cnblogs.com/luhuajun/p/7908409.html