标准CSV解析

很方便的csv解析,可以解析带有分隔符的字段,可以解析成map方便程序使用。

//
//  CCSVParse.h
//  CPPAlgorithm
//
//  Created by xujw on 16/2/26.
//  Copyright © 2016年 xujw. All rights reserved.
//

#ifndef CCSVParse_h
#define CCSVParse_h

#include <stdio.h>
#include <vector>
#include <string>
#include <map>
#include <iostream>
#include <sstream>

/*
 * Free functions that convert a string to a numeric type:
 * conToInt
 * conToFloat
 * conToDouble
 */
int conToInt(std::string &source);
float conToFloat(std::string &source);
double conToDouble(std::string &source);
// Convert a number to std::string
std::string conToString(int s);
std::string conToString(float s);
std::string conToString(double s);


/*
 * CSV parser: loads a file, splits it into rows and fields, and keeps the
 * most recently parsed grid so it can be printed or counted afterwards.
 */
class CCSVParse
{
public:
    CCSVParse();
    ~CCSVParse();

    /*
     *  Parse a CSV file into a map of maps. The first row must hold the
     *  column names and the first column must hold the row id (ids must be
     *  unique). Look a row up by its id, then look a value up in that row
     *  by column name.
     *  Expected layout:
     *   id   name    age
     *   1    小明     20
     *   2    小红     19
     *   3    小光     18
     */
    std::map<std::string, std::map<std::string, std::string> > parseCsvFileToMap(const std::string &fileName,const std::string &separator = ",");

    // Parse a CSV file into rows and columns. The separator must be exactly
    // one character (for example , or #).
    std::vector< std::vector<std::string> > parseCsvFile(const std::string &fileName,const std::string &separator = ",");

    // Print the parsed data to standard output (for testing).
    void printParseData() const;

    // Number of rows currently held from the last parse.
    size_t getRowNum() const {return _gridData.size();}

    // Switch simple mode on or off. In simple mode a field must not contain
    // the separator.
    void useSimpleModel(bool flag){_useSimpleModel = flag;}

    /*
        Split a string.
        str:       the string (or whole file content) to split
        separator: the delimiter to split on
     */
    std::vector<std::string> splitString(const std::string &str,const std::string &separator);

    // Read the whole file into a string; returns "" when it cannot be opened.
    std::string loadCsvFile(const std::string &fileName);

private:
    // Raw row/column data from the last parse
    std::vector< std::vector<std::string> > _gridData;
    bool _useSimpleModel;   // whether simple mode is enabled
};

#endif /* CCSVParse_h */

//
//  CCSVParse.cpp
//  CPPAlgorithm
//
//  Created by xujw on 16/2/26.
//  Copyright © 2016年 xujw. All rights reserved.
//

#include "CCSVParse.h"
#include <assert.h>
#include <time.h>

// A new parser starts with simple mode switched off.
CCSVParse::CCSVParse()
    : _useSimpleModel(false)
{
}

// No manual cleanup is performed here.
CCSVParse::~CCSVParse()
{
}

/*
 * Split str on every occurrence of separator.
 *
 * str:       the text to split; an empty string yields an empty vector
 * separator: the delimiter, of any length; an empty separator yields a
 *            single piece holding the whole of str
 *
 * Returns the pieces in order. Adjacent separators and a trailing separator
 * produce empty pieces, so "a,,b," split on "," gives {"a", "", "b", ""}.
 */
std::vector<std::string> CCSVParse::splitString(const std::string &str, const std::string &separator)
{
    std::vector<std::string> resVec;
    if (str.empty())
    {
        return resVec;
    }
    // Searching for "" matches at every position and cannot split anything
    // meaningfully, so hand the input back as one piece.
    if (separator.empty())
    {
        resVec.push_back(str);
        return resVec;
    }

    // Walk the input by index instead of repeatedly copying the remainder,
    // and step over the full separator length so multi-character
    // separators are handled correctly.
    size_t fieldStart = 0;
    size_t hit = str.find(separator, fieldStart);
    while (hit != std::string::npos)
    {
        resVec.push_back(str.substr(fieldStart, hit - fieldStart));
        fieldStart = hit + separator.size();
        hit = str.find(separator, fieldStart);
    }

    // Whatever follows the last separator (possibly nothing) is the final piece
    resVec.push_back(str.substr(fieldStart));
    return resVec;
}

/*
 * Read the whole file into a string, byte for byte (binary mode).
 *
 * fileName: path of the file to read
 *
 * Returns the file content, or "" when the file cannot be opened, its size
 * cannot be determined, or it is empty. If fewer bytes than expected are
 * read, only the bytes actually read are returned.
 */
std::string CCSVParse::loadCsvFile(const std::string &fileName)
{
    FILE *pFile = fopen(fileName.c_str(), "rb");
    if (0 == pFile)
    {
        return "";
    }

    std::string strRead;

    // Seek to the end to learn the size, then back to the start to read
    if (0 == fseek(pFile, 0, SEEK_END))
    {
        const long len = ftell(pFile);   // -1 on failure
        if (len > 0 && 0 == fseek(pFile, 0, SEEK_SET))
        {
            // Read straight into the string's storage; no temporary buffer
            strRead.resize(static_cast<size_t>(len));
            const size_t got = fread(&strRead[0], 1, strRead.size(), pFile);
            strRead.resize(got);         // keep only what was really read
        }
    }

    fclose(pFile);
    return strRead;
}

std::vector<std::vector<std::string>> CCSVParse::parseCsvFile(const std::string &fileName,const std::string &separator)
{
    clock_t before = clock();

    _gridData.clear();

    std::string strAllData = loadCsvFile(fileName);

    if (strAllData.size() == 0)
    {
        return _gridData;
    }

    //分隔符只能是一个字符
    assert(separator.size() == 1);

    //简易模式,字段里面不能包含分隔符
    if (_useSimpleModel)
    {
        std::cout<<"使用简易模式解析"<<std::endl;
        //分出行和字段
        std::vector<std::string> ret = splitString(strAllData, "
");
        for (size_t i=0; i<ret.size(); i++)
        {
            std::vector<std::string> rowData = splitString(ret.at(i), separator);
            _gridData.push_back(rowData);
        }
        return _gridData;
    }

    //标准模式,字段里面可以包含分隔符
    //定义状态
    typedef enum stateType
    {
        kNewFieldStart = 0,  //新字段开始
        kNonQuotesField,     //非引号字段
        kQuotesField,        //引号字段
        kFieldSeparator,     //字段分隔
        kQuoteInQuotesField, //引号字段中的引号
        kRowSeparator,       //行分隔符(回车)
        kError               //语法错误
    }StateType;

    //分出行
    std::vector<std::string> vecRows = splitString(strAllData, "
");
    for (int i=0; i<vecRows.size(); i++)
    {
        //一行一行处理
        std::string strRowData = vecRows.at(i);
        if (0 == strRowData.size())
        {
            continue;
        }

        std::vector< std::string > vecFields;
        std::string strField;
        StateType state = kNewFieldStart;
        for (int j=0; j<strRowData.size(); j++)
        {
            const char &ch = strRowData.at(j);
            switch ( state )
            {
                case kNewFieldStart:
                {
                    if (ch == '"')
                    {
                        state = kQuotesField;
                    }
                    else if (ch == separator.at(0))
                    {
                        vecFields.push_back("");
                        state = kFieldSeparator;
                    }
                    else if (ch == '
' || ch == '
')
                    {
                        state = kRowSeparator;
                    }
                    else
                    {
                        strField.push_back(ch);
                        state = kNonQuotesField;
                    }
                }
                    break;

                case kNonQuotesField:
                {
                    if (ch == separator.at(0))
                    {
                        vecFields.push_back(strField);
                        strField.clear();
                        state = kFieldSeparator;
                    }
                    else if (ch == '
' || ch == '
')
                    {
                        vecFields.push_back(strField);
                        state = kRowSeparator;
                    }
                    else
                    {
                        strField.push_back(ch);
                    }
                }
                    break;

                case kQuotesField:
                {
                    if (ch == '"')
                    {
                        state = kQuoteInQuotesField;
                    }
                    else
                    {
                        strField.push_back(ch);
                    }
                }
                    break;

                case kFieldSeparator:
                {
                    if (ch == separator.at(0))
                    {
                        vecFields.push_back("");
                    }
                    else if (ch == '"')
                    {
                        strField.clear();
                        state = kQuotesField;
                    }
                    else if (ch == '
' || ch == '
')
                    {
                        vecFields.push_back("");
                        state = kRowSeparator;
                    }
                    else
                    {
                        strField.push_back(ch);
                        state = kNonQuotesField;
                    }

                }
                    break;

                case kQuoteInQuotesField:
                {
                    if (ch == separator.at(0))
                    {
                        //引号闭合
                        vecFields.push_back(strField);
                        strField.clear();
                        state = kFieldSeparator;
                    }
                    else if (ch == '
' || ch == '
')
                    {
                        vecFields.push_back(strField);
                        state = kRowSeparator;
                    }
                    else if (ch == '"')
                    {
                        //转义引号
                        strField.push_back(ch);
                        state = kQuotesField;
                    }
                    else
                    {
                        //引号字段里包含引号时,需要对内引号进行加引号转义
                        std::cout<<"语法错误: 转义字符 " 不能完成转义 或 引号字段结尾引号没有紧贴字段分隔符"<<std::endl;
                        assert(false);
                    }

                }
                    break;

                case kRowSeparator:
                {
                    _gridData.push_back(vecFields);
                    continue;
                }
                    break;

                case kError:
                {

                }
                    break;

                default:
                    break;
            }

        }

        switch (state)
        {
            case kNonQuotesField:
            {
                vecFields.push_back(strField);
                _gridData.push_back(vecFields);
            }
                break;
            case kQuoteInQuotesField:
            {
                vecFields.push_back(strField);
                _gridData.push_back(vecFields);
            }
                break;
            case kFieldSeparator:
            {
                vecFields.push_back("");
                _gridData.push_back(vecFields);
            }
                break;
            case kRowSeparator:
            {
                _gridData.push_back(vecFields);
            }
                break;
            default:
                break;
        }
    }


    float used = (float)(clock()-before)/CLOCKS_PER_SEC;
    std::cout<<"解析此csv花费时间:"<<used<<"S"<<std::endl;

    return _gridData;
}

/*
 * Print the grid from the last parse to standard output (for testing):
 * a heading, the row count, then each row on its own line with a tab after
 * every field and a blank line after every row.
 */
void CCSVParse::printParseData() const
{
    std::cout<<"以下是解析的csv数据:"<<std::endl;
    std::cout<<"row counts:"<<_gridData.size()<<std::endl;

    // Iterate by reference so rows are not copied just to be printed
    for (const std::vector<std::string> &rowData : _gridData)
    {
        for (const std::string &field : rowData)
        {
            std::cout<<field<<"\t";
        }
        std::cout<<"\n"<<std::endl;
    }
}

/*
 * Parse a CSV file into a map keyed by row id.
 *
 * fileName:  path of the CSV file
 * separator: field delimiter, forwarded to parseCsvFile
 *
 * The first row supplies the column names. Every following row becomes a
 * map from column name to value, stored under the value of its first
 * column. Rows with no fields are skipped. A row with fewer fields than the
 * header gets "" for the missing columns; extra fields are ignored. When
 * two rows share an id, the later one replaces the earlier one.
 *
 * Returns an empty map when the file yields no rows.
 */
std::map<std::string, std::map<std::string, std::string> > CCSVParse::parseCsvFileToMap(const std::string &fileName,const std::string &separator)
{
    // Get the plain row/column data first
    const std::vector<std::vector<std::string>> allData = parseCsvFile(fileName,separator);

    // Convert to dictionary form
    std::map<std::string, std::map<std::string, std::string> > mapAllData;
    if (allData.empty())
    {
        return mapAllData;
    }

    // The first row holds the column names used as keys
    const std::vector<std::string> &keyData = allData.front();

    for (size_t row=1; row<allData.size(); row++)
    {
        const std::vector<std::string> &rowData = allData.at(row);
        if (rowData.empty())
        {
            // No id column to key this row by
            continue;
        }

        std::map<std::string, std::string> mapRow;
        for (size_t col=0; col<keyData.size(); col++)
        {
            // Short rows get an empty value instead of throwing out_of_range
            mapRow[keyData.at(col)] = (col < rowData.size()) ? rowData.at(col) : std::string();
        }
        // The first column of every row is its id
        mapAllData[rowData.front()] = mapRow;
    }

    return mapAllData;
}

#pragma mark--全局函数 类型转换
// Read an int from the start of source using stream extraction (leading
// whitespace is skipped, trailing text is ignored).
// Returns 0 when source is empty or does not begin with a number.
int conToInt(std::string &source)
{
    std::istringstream reader(source);
    // Start from 0: on an empty input the extraction never writes the
    // target, which would otherwise be returned uninitialized.
    int res = 0;
    reader>>res;

    return res;
}
// Read a float from the start of source using stream extraction (leading
// whitespace is skipped, trailing text is ignored).
// Returns 0 when source is empty or does not begin with a number.
float conToFloat(std::string &source)
{
    std::istringstream reader(source);
    // Start from 0: on an empty input the extraction never writes the
    // target, which would otherwise be returned uninitialized.
    float res = 0.0f;
    reader>>res;

    return res;
}
// Read a double from the start of source using stream extraction (leading
// whitespace is skipped, trailing text is ignored).
// Returns 0 when source is empty or does not begin with a number.
double conToDouble(std::string &source)
{
    std::istringstream reader(source);
    // Start from 0: on an empty input the extraction never writes the
    // target, which would otherwise be returned uninitialized.
    double res = 0.0;
    reader>>res;

    return res;
}

// Format an int as text using default stream formatting.
std::string conToString(int s)
{
    std::ostringstream writer;
    writer<<s;

    // Take the formatted text as-is; reading it back with >> would stop at
    // the first whitespace character.
    return writer.str();
}
// Format a float as text using default stream formatting.
std::string conToString(float s)
{
    std::ostringstream writer;
    writer<<s;

    // Take the formatted text as-is; reading it back with >> would stop at
    // the first whitespace character.
    return writer.str();
}
// Format a double as text using default stream formatting.
std::string conToString(double s)
{
    std::ostringstream writer;
    writer<<s;

    // Take the formatted text as-is; reading it back with >> would stop at
    // the first whitespace character.
    return writer.str();
}

测试:
test.csv:
fid,name,age
1,小明,20
2,小刚,20
3,小红,19

// A stack object is enough here; no new/delete needed
CCSVParse parse;
// Parse into rows and columns
std::vector<std::vector<std::string>> s = parse.parseCsvFile("test.csv");
parse.printParseData();
// Parse into a dictionary keyed by the first column (fid)
std::map<std::string, std::map<std::string, std::string> > mapData = parse.parseCsvFileToMap("test.csv");
std::map<std::string, std::string> mapXiaom = mapData.at("1");
std::string name = mapXiaom.at("name"); // "小明"
int age = conToInt(mapXiaom.at("age")); // 20

解析结果:
1 小明 20
2 小刚 20
3 小红 19

下载链接:百度云盘下载

原文地址:https://www.cnblogs.com/skyxu123/p/9543806.html