# Pandas_数据读取与存储数据（精炼）

# 一，读取 CSV 文件：
# 文字解析函数：
# pd.read_csv()    从文件中加载带分隔符的数据，默认分隔符为逗号
# pd.read_table()  从文件中加载带分隔符的数据，默认分隔符为制表符

# read_csv()/read_table() 参数：
# path        文件路径，路径中如果有空格需要先使用 open函数获取文件对象
# sep         字段隔开的字符序列，也可使用正则表达式
# header      指定行标题（指定列索引），默认为0，也可以设为 None
# index_col   用于行索引的列名或列编号
# names       指定列索引的列名
# skiprows    需要忽略的行数（从文件开始处算）
# nrows       需要读取的行数（从文件开始处算）
# chunksize   文件块的大小
# usecols     指定读取的列号或列名

import csv

import pandas as pd
# Write a small sample CSV: one header row followed by three records.
# NOTE(review): the path separators were missing from the original text (and
# "\t" had collapsed into a tab character); the path below is the
# reconstructed location - confirm the directory layout.
# newline='' lets the csv module control line endings itself, and the
# with-block closes the file even if a write fails.
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test01.csv',
          "w", newline='', encoding='utf-8') as file_obj:
    writer = csv.writer(file_obj)
    writer.writerows([
        ('id', 'name', 'grade'),
        ('1', 'lucy', '90'),
        ('2', 'tom', '88'),
        ('3', 'Collin', '99'),
    ])

# !type C:\Users\XuYunPeng\Desktop\python\test01.csv   (只适用于 windows,且文件路径中不能存在空格)

# 1, Read a CSV file with pd.read_csv():
# 1) When the path contains a space, the author opens the file first and
#    passes the file object to pandas. The file is decoded as UTF-8 because
#    that is the encoding it was written with.
# NOTE(review): path separators were missing in the original text; the paths
# below are reconstructed - confirm the directory layout.
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test01.csv',
          encoding='utf-8') as file_obj:
    df = pd.read_csv(file_obj)
df  # bare expression: shows the frame in a notebook/REPL, no effect in a plain script
# 2) When the path contains no space, the path string is passed directly:
df = pd.read_csv(r'C:\Users\XuYunPeng\Desktop\python\test01.csv')
df

# 2, pd.read_table()读取文件内容：
# 比 pd.read_csv()多一个 sep=','参数，其他一样。

# 3, Use a column as the row index instead of the default integer index:
#    index_col='id'
# NOTE(review): path separators were missing in the original text; the path
# below is reconstructed - confirm the directory layout.
df = pd.read_csv(r'C:\Users\XuYunPeng\Desktop\python\test01.csv', index_col='id')
# Equivalent to reading first and promoting the column afterwards:
df = pd.read_csv(r'C:\Users\XuYunPeng\Desktop\python\test01.csv')
df = df.set_index('id')

# 4, Hierarchical (multi-level) index: pass a list of column positions and/or
#    column names to index_col, e.g. index_col=[0, 'id'], index_col=[0, 1]
#    or index_col=['school', 'id'].
# NOTE(review): path separators were missing in the original text; the path
# below is reconstructed - confirm the directory layout.
# Create a CSV file with a repeated 'school' key to index on:
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test02.csv',
          "w", newline='', encoding='utf-8') as file_obj:
    writer = csv.writer(file_obj)
    writer.writerows([
        ('school', 'id', 'name', 'grade'),
        ('sh01', '1', 'lucy', '90'),
        ('sh01', '2', 'tom', '88'),
        ('sh01', '3', 'Collin', '99'),
        ('sh02', '1', 'ppp', '90'),
        ('sh02', '2', 'aaa', '88'),
        ('sh02', '3', 'sss', '99'),
        ('sh03', '1', 'hhh', '90'),
        ('sh03', '2', 'jjj', '88'),
        ('sh03', '3', 'mmm', '99'),
    ])

# Read it back; the file must still be open while read_csv consumes it, so
# the read happens inside the with-block and the close happens on exit.
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test02.csv',
          encoding='utf-8') as file_obj:
    df = pd.read_csv(file_obj, index_col=['school', 'id'])
df



# 5, Header row handling
# pd.read_csv() / pd.read_table() treat the first line as the header row by
# default. For a file without a header row, either pass header=None or supply
# the column labels with names=['id', 'name', 'grade'].
# NOTE(review): test03.csv is not created anywhere in this file; it is assumed
# to exist and to have no header line. Path separators were missing in the
# original text; the path below is reconstructed.
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test03.csv') as file_obj:
    df = pd.read_csv(file_obj, names=['id', 'name', 'grade'])  # read inside the block, close on exit
df

# 6, Read only part of the file: skip rows, cap the row count, pick columns.
# The original passed a file object that had already been closed, so the file
# is reopened for each call here.
# names= is supplied because skiprows=[0, 5] removes the first line; without
# explicit labels the name-based usecols below could not be resolved.
# NOTE(review): presumably the headerless test03.csv from the previous
# example is the intended input - confirm. The path is reconstructed because
# the separators were missing in the original text.
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test03.csv') as file_obj:
    df = pd.read_csv(file_obj, names=['id', 'name', 'grade'],
                     skiprows=[0, 5], nrows=10, usecols=['name', 'grade'])  # select by column name
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\test03.csv') as file_obj:
    df = pd.read_csv(file_obj, names=['id', 'name', 'grade'],
                     skiprows=[0, 5], nrows=10, usecols=[1, 2])  # select by column position

# 7, Read a file in blocks: chunksize=100
# Large files are processed block by block:
# step 1: df = pd.read_csv(path); df.info() gives an overview of the row
#         index and of each column.
# step 2: chunker = pd.read_csv(path, chunksize=100) returns an iterable.
# step 3: loop over chunker with for ... in.

# Example:
import pandas as pd
from pandas import Series,DataFrame

# NOTE(review): path separators were missing in the original text (and "\t"
# had collapsed into a tab); the path below is reconstructed.
train_path = r"C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\train.csv"

# step 1: load everything once to inspect the data.
with open(train_path) as file_obj:
    df = pd.read_csv(file_obj)
df.info()

# step 2 + step 3: iterate over 100-row DataFrame blocks and accumulate the
# per-value counts of the 'Sex' column (presumably 'male' / 'female').
sex = Series([], dtype='float64')  # empty accumulator
with open(train_path) as file_obj:
    chunker = pd.read_csv(file_obj, chunksize=100)
    for chunk in chunker:  # each chunk is a DataFrame of up to 100 rows
        # Series.add(..., fill_value=0) is used instead of `+` so that a label
        # missing from one side counts as 0 rather than turning into NaN.
        sex = sex.add(chunk['Sex'].value_counts(), fill_value=0)
# The file is only closed here, after the loop: the chunks are read lazily,
# so closing it earlier would make the iteration fail.
sex


# 二，读取 txt 文件： pd.read_table()

# txt 文件的分隔符可能不是逗号（通常由不定数量的空格作为分隔），这里我们创建一个txt文件，并以 ？ 作为分隔符：
# 创建一个 txt 文件：和创建 csv 不同的地方：
# 1）不需要使用 writer = csv.writer(file_obj) 创建 writer对象
# 2）写入数据的函数名称及调用函数的对象也不一样：写入csv 是 writer.writerow(); 写入 txt 是 file_obj.writelines()

# Example 1: write a text file that uses '?' as the field separator, then
# read it back with pd.read_table(sep="?").
# The original string literals were broken across lines where the "\n"
# escape had been turned into a real line break; they are restored here.
# NOTE(review): path separators were missing in the original text; the path
# below is reconstructed - confirm the directory layout.
import pandas as pd
with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\testfortxt01.txt', 'w') as file_obj:
    file_obj.writelines([
        'id?name?grade' + '\n',
        '1?lucy?80' + '\n',
        '2?tom?85' + '\n',
        '3?lili?85' + '\n',
    ])

with open(r'C:\Users\XuYunPeng\PycharmProjects\Python基础知识大全\第10章 Numpy库的使用\testfortxt01.txt', 'r') as file_obj:
    df = pd.read_table(file_obj, sep="?")
df

# Example 2:
# Many text files have no dedicated separator; the fields are separated by a
# varying amount of whitespace, for example:

import pandas as pd
# The original string literals were broken across lines where the "\n"
# escape had been turned into a real line break; they are restored here.
# NOTE(review): path separators were missing in the original text; the path
# below is reconstructed - confirm the directory layout.
with open(r'C:\Users\XuYunPeng\Desktop\python\testfortxt02.txt', 'w') as file_obj:
    file_obj.writelines([
        'id   name grade' + '\n',
        '1 lucy  80' + '\n',
        '2 tom 85' + '\n',
        '3  lili   85' + '\n',
    ])

# Notebook-only shell magic (Windows), kept as a comment so the file stays
# valid Python:
# !type C:\Users\XuYunPeng\Desktop\python\testfortxt02.txt

# Split on runs of whitespace with a regular expression: sep=r'\s+'
# (the backslash had been lost, leaving 's+', which splits on the letter s).
with open(r'C:\Users\XuYunPeng\Desktop\python\testfortxt02.txt', 'r') as file_obj:
    df = pd.read_table(file_obj, sep=r'\s+')
df


# 三, 存储 csv 和 txt 文件：
# to_csv()是 DataFrame类的方法，read_csv()是pandas的方法

# 对数据进行处理分析后，往往要把处理结果存储起来：
# 语法及默认参数：
# df.to_csv(path_or_buf=None, sep=',', na_rep='', float_format=None, columns=None, 
# header=True, index=True, index_label=None, mode='w', encoding=None, compression=None, 
# quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=None, 
# date_format=None, doublequote=True, escapechar=None, decimal='.')

# 对于 index参数： False值    不在csv文件里新建索引
#                 index缺省  在csv的第一列里新建 从0开始的索引。（默认）
# 对于 header参数：header=None,不保留 DataFrame里的 header(即首行)了。
#                 header=True或缺省,保留 DataFrame的header(即首行)
#                 header=[header名组成的列表]，不保留 DataFrame里的 header(即首行)了，使用新建的 headers
# 对于 sep参数:   ','          将按照 DataFrame的列，对应写到 csv的各列中
#               其他分隔符     不分列，都写到 csv的第一列中


    
# 四，JSON 数据的读取与存储
# javascript object notation  简洁清晰，轻量级的数据交换格式，多用于 web 数据交换

# 1，读取 JSON数据：有两种方法读取：推荐方法 2
# 1.1）使用 JSON 库，将数据转化为 字符串格式。略
# 1.2)，使用 pandas的read_json()函数来读取 JSON数据：
import pandas as pd
# NOTE(review): path separators were missing in the original text; the paths
# below are reconstructed - confirm the directory layout and file name.
df = pd.read_json(r"C:\Users\XuYunPeng\Desktop\python\eueo2012.json")
# Sort by row index to get a stable row order; the author notes that the row
# order of the loaded data may otherwise differ between reads.
df = df.sort_index()
df

# 2, Store a DataFrame as JSON: DataFrame_obj.to_json(path)
df.to_json(r"C:\Users\XuYunPeng\Desktop\python\eueo2012_output.json")
# !type C:\Users\XuYunPeng\Desktop\python\eueo2012_output.json



# 五，Excel 数据的读取与存储

# 1, Create an Excel workbook and fill in a few rows:
import xlwings as xw
app = xw.App(visible=False, add_book=False)  # run Excel invisibly, without an initial workbook
try:
    wb = app.books.add()
    ws = wb.sheets.active
    arr = [['id','name','grade'],[1,'lucy','88'],[2,'tom','89'],[3,'collin','90']]
    ws.range('A1').value = arr  # A1 is the anchor cell; the nested list is written as a table from there
    # NOTE(review): path separators were missing in the original text; the
    # path below is reconstructed - confirm the directory layout.
    wb.save(r"C:\Users\XuYunPeng\Desktop\python\excel_test.xlsx")
    wb.close()
finally:
    # Always shut the hidden Excel instance down, even if saving fails.
    app.quit()
# The original called exit() here, which stops the interpreter and prevents
# every later example in this file from running; it has been removed.
# 2, Read Excel data with pd.read_excel():
# NOTE(review): path separators were missing in the original text; the paths
# below are reconstructed - confirm the directory layout.
import pandas as pd
df = pd.read_excel(r"C:\Users\XuYunPeng\Desktop\python\excel_test.xlsx", sheet_name='Sheet1')
df

# 3, Store a DataFrame in an Excel file, overwriting the existing file.
#    The data is written to a sheet named 'output', starting at the top-left
#    cell and without the row index.
df.to_excel(r"C:\Users\XuYunPeng\Desktop\python\excel_test.xlsx",
            sheet_name='output', index=False, startcol=0, startrow=0)

# 4, To avoid overwriting existing data, a CSV file can serve as an
#    intermediate carrier, because to_csv() supports mode='a' (append).
import os
# NOTE(review): path separators were missing in the original text; the paths
# below are reconstructed - confirm the directory layout.
csv_path = r"C:\Users\XuYunPeng\Desktop\python\excel_test.csv"
df.to_csv(csv_path, index=False)                          # first write: header + rows
df.to_csv(csv_path, index=False, header=False, mode='a')  # append the rows again, no second header
df = pd.read_csv(csv_path)
# The encoding argument of to_excel was dropped: it was removed in pandas 2.0,
# where passing it raises a TypeError.
df.to_excel(r"C:\Users\XuYunPeng\Desktop\python\excel_test.xlsx", index=False, sheet_name='Sheet1')
os.remove(csv_path)  # delete the intermediate file
# !type C:\Users\XuYunPeng\Desktop\python\excel_test.xlsx

# 5, Write data into a new sheet of an Excel file that already holds data.
#    If the requested sheet name already exists, a new sheet is created
#    instead of replacing the existing one.
import pandas as pd
import openpyxl
# mode='a' opens the existing workbook for appending; without it the file
# would be overwritten. if_sheet_exists='new' (pandas >= 1.3) makes pandas
# create a new sheet, named by the engine, when the sheet name is taken.
# This replaces the original `writer.book = openpyxl.load_workbook(writer.path)`
# plus `writer.save()` sequence, which no longer works on pandas 2.x.
# The with-block saves and closes the workbook on exit.
# NOTE(review): path separators were missing in the original text; the path
# below is reconstructed - confirm the directory layout.
with pd.ExcelWriter(r'C:\Users\XuYunPeng\Desktop\python\excel_test.xlsx',
                    engine='openpyxl', mode='a', if_sheet_exists='new') as writer:
    df.to_excel(excel_writer=writer, sheet_name="Sheet1", index=False)  # "Sheet1" exists, so a new sheet is added
    # Use sheet_name="Sheet2" instead to create a sheet with that exact name.

# r : 只能读, 必须存在, 可在任意位置读取
# w : 只能写, 可以不存在, 必会擦掉原有内容从头写
# a : 只能写, 可以不存在, 必不能修改原有内容, 只能在结尾追加写, 文件指针无效
# r+ : 可读可写, 必须存在, 可在任意位置读写, 读与写共用同一个指针
# w+ : 可读可写, 可以不存在, 必会擦掉原有内容从头写
# a+ : 可读可写, 可以不存在, 必不能修改原有内容, 只能在结尾追加写, 文件指针只对读有效 (写操作会将文件指针移动到文件尾)


# 六, 一道练习题：

# 创建一个 csv文件，包含'CNUM'和'COMPANY'两列，创建包含空行的，且有内容重复的行数据。
# 然后处理数据：去掉空行，重复行数据只保留一行有效数据，修改'COMPANY'列的名称为'公司',并在其后增加六列，
# 分别为'C_col','D_col','E_col','F_col','G_col','H_col'。

import pandas as pd
import numpy as np
import csv
from pandas import DataFrame,Series

# step 1: 创建含数据的文件：

# Build the exercise input: a CNUM/COMPANY table that contains duplicated
# rows and two empty records (written as rows of empty strings).
# NOTE(review): path separators were missing in the original text; the path
# below is reconstructed - confirm the directory layout.
with open(r'C:\Users\XuYunPeng\Desktop\python\CNUM_COMPANY.csv',
          "w", newline='', encoding='utf-8') as file_obj:
    writer = csv.writer(file_obj)
    writer.writerows([
        ('CNUM', 'COMPANY'),
        ('1001', 'IBMA'),
        ('1002', 'IBMA'),
        ('1003', 'IBMA'),
        ('1001', 'IBMA'),
        ('', ''),  # empty record
        ('1002', 'IBMB'),
        ('1003', 'IBMC'),
        ('1001', 'IBMB'),
        ('1002', 'IBMA'),
        ('1003', 'IBMC'),
        ('', ''),  # empty record
        ('1005', 'IBMA'),
        ('1003', 'IBMH'),
        ('1006', 'IBMD'),
        ('1007', 'IBMF'),
        ('1008', 'IBMA'),
    ])

# 查看文件内容：
# !type C:UsersXuYunPengDesktoppythonCNUM_COMPANY.csv

# step 2: 处理数据：
# NOTE(review): path separators were missing in the original text; the paths
# below are reconstructed - confirm the directory layout.
# The input is decoded as UTF-8 because that is the encoding it was written with.
with open(r'C:\Users\XuYunPeng\Desktop\python\CNUM_COMPANY.csv', encoding='utf-8') as file_obj:
    df = pd.read_csv(file_obj)  # build the DataFrame
# Re-specify the columns: keeps CNUM/COMPANY and appends six new, empty columns.
df = df.reindex(columns=['CNUM','COMPANY','C_col','D_col','E_col','F_col','G_col','H_col'], fill_value=None)
df = df.rename(columns={'COMPANY':'公司'})  # rename the column
df = df.dropna(axis=0, how='all')          # drop rows where every value is missing, i.e. the empty records
df['CNUM'] = df['CNUM'].astype('int32')    # cast CNUM to int32 (only possible once the missing values are gone)
df = df.drop_duplicates(subset=['CNUM', '公司'], keep='first')  # keep the first row of each duplicate group
# Functions not covered so far will be explained in later blog posts.

# step 3: save the result:
df.to_csv(r'C:\Users\XuYunPeng\Desktop\python\CNUM_COMPANY_OUTPUT.csv', index=False, encoding='GBK')

# Show the file contents (notebook-only shell magic on Windows, kept as a
# comment so the file stays valid Python):
# !type C:\Users\XuYunPeng\Desktop\python\CNUM_COMPANY_OUTPUT.csv