用正则表达式和java解析csv文件

在解析csv文件之前,先来看看什么是csv文件以及csv文件的格式。

csv(Comma Separate Values)文件即逗号分隔符文件,它是一种文本文件,可以直接以文本打开,以逗号分隔。windows默认用excel打开。它的格式包括以下几点(它的格式最好就看excel是如何解析的。):

①每条记录占一行;
②以逗号为分隔符;
③逗号前后的空格会被忽略;
④字段中包含有逗号,该字段必须用双引号括起来;
⑤字段中包含有换行符,该字段必须用双引号括起来;
⑥字段前后包含有空格,该字段必须用双引号括起来;
⑦字段中的双引号用两个双引号表示;
⑧字段中如果有双引号,该字段必须用双引号括起来;
⑨第一条记录,可以是字段名;

⑩以上提到的逗号和双引号均为半角字符。

下面通过正则表达式和java解析csv文件。

首先给出匹配csv文件的一个最小单位数据的正则表达式(如:1,2,3是csv文件的一行数据,则1,是该csv文件的一个最小单位数据):

"(([^",\n  ]*[,\n  ])*([^",\n  ]*"{2})*)*[^",\n  ]*"[  ]*,[  ]*|[^",\n]*[  ]*,[  ]*|"(([^",\n  ]*[,\n  ])*([^",\n  ]*"{2})*)*[^",\n  ]*"[  ]*|[^",\n]*[  ]*

 

下面是解析文件的java代码:

package myutil;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @author  panhf2003
 * @version 2008/09/05,
 */

public class CsvFileUtil {

    /**
     * 构造,禁止实例化
     */
    private CsvFileUtil() {
    }

    public static void main(String[] args) {

        // test
        try {
            readCsvFile("e:\\test1.csv");
        } catch (FileNotFoundException ex) {
            Logger.getLogger(CsvFileUtil.class.getName()).log(Level.SEVERE, null, ex);
        } catch (IOException ex) {
            Logger.getLogger(CsvFileUtil.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    /**
     * csv文件读取<BR/>
     * 读取绝对路径为argPath的csv文件数据,并以List返回。
     *
     * @param argPath csv文件绝对路径
     * @return csv文件数据(List<String[]>)
     * @throws FileNotFoundException
     * @throws IOException
     */
    public static List readCsvFile(String argPath) throws FileNotFoundException, IOException {
        CsvFileUtil util = new CsvFileUtil();
        File cvsFile = new File(argPath);
        List list = new ArrayList();
        FileReader fileReader = null;
        BufferedReader bufferedReader = null;
        try {
            fileReader = new FileReader(cvsFile);
            bufferedReader = new BufferedReader(fileReader);
            String regExp = util.getRegExp();

            // test
            System.out.println(regExp);
            String strLine = "";
            String str = "";
            while ((strLine = bufferedReader.readLine()) != null) {
                Pattern pattern = Pattern.compile(regExp);
                Matcher matcher = pattern.matcher(strLine);
                List listTemp = new ArrayList();
                while(matcher.find()) {
                    str = matcher.group();
                    str = str.trim();
                    if (str.endsWith(",")){
                        str = str.substring(0, str.length()-1);
                        str = str.trim();
                    }
                    if (str.startsWith("\"") && str.endsWith("\"")) {
                        str = str.substring(1, str.length()-1);
                        if (util.isExisted("\"\"", str)) {
                            str = str.replaceAll("\"\"", "\"");
                        }
                    }
                    if (!"".equals(str)) {
                        //test
                        System.out.print(str+" ");
                        listTemp.add(str);
                    }
                }
                //test
                System.out.println();
                list.add((String[]) listTemp.toArray(new String[listTemp.size()]));
            }
        } catch (FileNotFoundException e) {
            throw e;
        } catch (IOException e) {
            throw e;
        } finally {
            try {
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
                if (fileReader != null) {
                    fileReader.close();
                }
            } catch (IOException e) {
                throw e;
            }
        }
        return list;
    }
    
    /**
     * csv文件做成<BR/>
     * 将argList写入argPath路径下的argFileName文件里。
     *
     * @param argList  要写入csv文件的数据(List<String[]>)
     * @param argPath csv文件路径
     * @param argFileName csv文件名
     * @param isNewFile 是否覆盖原有文件
     * @throws IOException
     * @throws Exception
     */
    public static void writeCsvFile(List argList, String argPath, String argFileName, boolean isNewFile)
        throws IOException, Exception {
        CsvFileUtil util = new CsvFileUtil();
        // 数据check
        if (argList == null || argList.size() == 0) {
            throw new Exception("没有数据");
        }
        for (int i = 0; i < argList.size(); i++) {
            if (!(argList.get(i) instanceof String[])) {
                throw new Exception("数据格式不对");
            }
        }
        FileWriter fileWriter = null;
        BufferedWriter bufferedWriter = null;
        String strFullFileName = argPath;
        if (strFullFileName.lastIndexOf("\\") == (strFullFileName.length() - 1)) {
            strFullFileName += argFileName;
        } else {
            strFullFileName += "\\" + argFileName;
        }
        File file = new File(strFullFileName);
        // 文件路径check
        if (!file.getParentFile().exists()) {
            file.getParentFile().mkdirs();
        }
        try {
            if (isNewFile) {
                // 覆盖原有文件
                fileWriter = new FileWriter(file);
            } else {
                // 在原有文件上追加数据
                fileWriter = new FileWriter(file, true);
            }
            bufferedWriter = new BufferedWriter(fileWriter);
            for (int i = 0; i < argList.size(); i++) {
                String[] strTemp = (String[]) argList.get(i);
                for (int j = 0; j < strTemp.length; j++) {
                    if (util.isExisted("\"",strTemp[j])) {
                        strTemp[j] = strTemp[j].replaceAll("\"", "\"\"");
                        bufferedWriter.write("\""+strTemp[j]+"\"");
                    } else if (util.isExisted(",",strTemp[j])
                            || util.isExisted("\n",strTemp[j])
                            || util.isExisted(" ",strTemp[j])
                            || util.isExisted("��",strTemp[j])){
                        bufferedWriter.write("\""+strTemp[j]+"\"");
                    } else {
                        bufferedWriter.write(strTemp[j]);
                    }
                    if (j < strTemp.length - 1) {
                        bufferedWriter.write(",");
                    }
                }
                bufferedWriter.newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (bufferedWriter != null) {
                    bufferedWriter.close();
                }
                if (fileWriter != null) {
                    fileWriter.close();
                }
            } catch (IOException e) {
                throw e;
            }
        }
    }
    
    /**
     * @param argChar
     * @param argStr
     * @return
     */
    private boolean isExisted(String argChar, String argStr) {
        
        boolean blnReturnValue = false;
        if ((argStr.indexOf(argChar) >= 0)
                && (argStr.indexOf(argChar) <= argStr.length())) {
            blnReturnValue = true;
        }
        return blnReturnValue;
    }
    
    /**
     * 正则表达式。
     * @return 匹配csv文件里最小单位的正则表达式。
     */
    private String getRegExp() {
        
        String strRegExp = "";
        
        strRegExp =
            "\"(("+ SPECIAL_CHAR_A + "*[,\\n  ])*("+ SPECIAL_CHAR_A + "*\"{2})*)*"+ SPECIAL_CHAR_A + "*\"[  ]*,[  ]*"
            +"|"+ SPECIAL_CHAR_B + "*[  ]*,[  ]*"
            + "|\"(("+ SPECIAL_CHAR_A + "*[,\\n  ])*("+ SPECIAL_CHAR_A + "*\"{2})*)*"+ SPECIAL_CHAR_A + "*\"[  ]*"
            + "|"+ SPECIAL_CHAR_B + "*[  ]*";
        
        return strRegExp;
    }
    
    private static final String SPECIAL_CHAR_A = "[^\",\\n  ]";
    private static final String SPECIAL_CHAR_B = "[^\",\\n]";
}




本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/panhf2003/archive/2008/09/16/2937853.aspx
原文地址:https://www.cnblogs.com/pricks/p/1503810.html