java使用tika批量识别文件的真实mime类型

生产环境中,服务器使用JDK1.7。为了在服务器上限制上传文件的类型,现在想先把已上传文件的真实类型进行汇总。

idea中新建maven项目,选择quickstart

pom.xml

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Build descriptor for the "mimetype" tool: a jar whose manifest main class is com.h2.MimeTypeMain. -->
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.h2</groupId>
  <artifactId>mimetype</artifactId>
  <version>1.0</version>

  <name>mimetype</name>

  <!-- Sources are read as UTF-8; compiler source and target levels are both 1.7. -->
  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>

    <!-- Apache Tika core, used for MIME type detection -->
    <dependency>
      <groupId>org.apache.tika</groupId>
      <artifactId>tika-core</artifactId>
      <version>1.18</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Runs the Scala plugin's compile and testCompile goals.
           NOTE(review): no Scala sources are shown for this project; confirm this plugin is needed. -->
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.2</version>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <!-- Java compiler pinned to source/target 1.7, the same values as the properties above. -->
      <plugin>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.6.0</version>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
        </configuration>
      </plugin>
      <!-- Configures the jar-with-dependencies descriptor; no execution is bound to a phase here. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.3</version>
        <configuration>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
      </plugin>

      <!-- Surefire is configured with skip=true, so tests are skipped during the build. -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>2.19</version>
        <configuration>
          <skip>true</skip>
        </configuration>
      </plugin>

      <plugin><!-- Include resource files that live in the class (source) directory.
                   NOTE(review): the include pattern below targets com/netmarch, while this
                   project's package is com.h2; confirm whether this entry is still needed. -->
        <groupId>org.codehaus.mojo</groupId>
        <artifactId>build-helper-maven-plugin</artifactId>
        <version>1.8</version>
        <executions>
          <execution>
            <id>add-resource</id>
            <phase>generate-resources</phase>
            <goals>
              <goal>add-resource</goal>
            </goals>
            <configuration>
              <resources>
                <resource>
                  <directory>src/main/java</directory>
                  <includes>
                    <include>com/netmarch/*.txt</include>
                  </includes>
                </resource>
              </resources>
            </configuration>
          </execution>
        </executions>
      </plugin>

      <plugin><!-- Bundle the third-party jar files into the packaged jar -->
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.4.3</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <!-- For every bundled artifact, leave out the META-INF *.SF, *.DSA and *.RSA entries. -->
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>com.h2.MimeTypeMain</mainClass><!-- class that contains the main method -->
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
    <!-- Goal used when Maven is invoked without an explicit goal. -->
    <defaultGoal>package</defaultGoal>
  </build>
</project>

MimeTypeMain.java

package com.h2;

import org.apache.tika.Tika;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class MimeTypeMain {

    Tika tika = new Tika();
    public static void main(String[] args){
        int length = args.length;
        if(length<2)
        {
            System.out.printf("usage: MimeTypeMain dir out.txt");
        }else
        {
            MimeTypeMain main = new MimeTypeMain();
            main.walk(args[0],args[1]);
        }
    }

    public void walk(String dir,String out){
        File dirs = new File(dir);
        File outFile = new File(out);
        if(!outFile.exists()){
            try {
                outFile.createNewFile();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        File[] files = dirs.listFiles();
        for (int i = 0; i < files.length; i++) {
            File file = files[i];
            if(file.isFile())
            {
                try {
                    String mimeType = tika.detect(file);
                    wirteToFile(out, String.format("mimeType:%s , path:%s
", mimeType, file.getAbsolutePath()));
                }catch (IOException e)
                {
                    e.printStackTrace();;
                }
            }else{
            String path = file.getAbsolutePath();
            walk(path,out);
            }
        }
    }

    void wirteToFile(String outFile,String content) throws IOException {
        try(FileWriter fileWritter = new FileWriter(outFile,true)) {
            fileWritter.write(content);
        }
    }
}

mvn package 生成可执行文件

 本地测试样本

 运行程序

java -jar mimetype-1.0.jar d:\样本 d:\1.txt

结果如下:

mimeType:application/msword , path:D:\样本\1.企业技术开发项目设计书(下进风机柜) V161015.doc
mimeType:application/vnd.ms-excel , path:D:\样本\12月份利润表1.xls
mimeType:application/vnd.ms-excel , path:D:\样本\12月份资产负债表1.xls
mimeType:application/zip , path:D:\样本\18年度公司财务报表.zip
mimeType:image/jpeg , path:D:\样本\1b125ae7ef59b854685cc8d6af8645c7.jpg
mimeType:application/x-rar-compressed , path:D:\样本\2018财务报表.rar
mimeType:image/tiff , path:D:\样本\5.4-专利受理通知书-一种下托盘摩擦焊工装夹具.tif
mimeType:image/jpeg , path:D:\样本\6E9D2271-1CB1-45AE-858D-4502F5EB2096.jpeg
mimeType:application/pdf , path:D:\样本\ASR手册.2019-10-12.pdf
mimeType:image/png , path:D:\样本\default_av_boy_v3.png
mimeType:image/png , path:D:\样本\default_av_girl_v3.png
mimeType:image/jpeg , path:D:\样本\ECF4A384BD56535EFB3335C39F778023.png
mimeType:image/jpeg , path:D:\样本\F48AF50EFD9316C865A832888DA8AEF1.png
mimeType:image/bmp , path:D:\样本\三证合一副本.bmp
mimeType:application/pdf , path:D:\样本\喇叭盖智能点胶组装设备研发项目设计书
mimeType:application/pdf , path:D:\样本\喇叭盖智能点胶组装设备项目研发情况表
mimeType:application/pdf , path:D:\样本\喇叭盖智能点胶组装设备项目研发立项决议
mimeType:application/pdf , path:D:\样本\塑料旋转开关项目情况表
mimeType:application/x-7z-compressed , path:D:\样本\新建文件夹 (3).7z
mimeType:application/x-tika-msoffice , path:D:\样本\江苏省企业研发项目情况表 1
mimeType:application/pdf , path:D:\样本\汽车安全系统组件智能装配线研发立项决议
mimeType:application/pdf , path:D:\样本\汽车安全系统组件智能装配线研发项目情况表
mimeType:application/pdf , path:D:\样本\汽车安全系统组件智能装配线研发项目设计书
mimeType:image/jpeg , path:D:\样本\项目情况3-1
mimeType:image/jpeg , path:D:\样本\项目情况3-2
mimeType:application/pdf , path:D:\样本\项目情况表
mimeType:application/pdf , path:D:\样本\项目立项决议
mimeType:application/pdf , path:D:\样本\项目设计书

 一些常见的mime type类型

image/bmp

image/x-bitmap

image/x-pixmap

image/jpg
image/png

image/jpeg
application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx
application/x-rar-compressed
application/vnd.openxmlformats-officedocument.wordprocessingml.document docx
application/zip zip
image/tiff tiff
image/bmp bmp
application/rtf rtf
application/x-tika-ooxml xls
application/x-bplist pdf
application/pdf
application/vnd.ms-word.document.macroenabled.12 docm
image/gif
application/vnd.openxmlformats-officedocument.presentationml.presentation pptx
application/x-tika-msoffice pdf
application/msword
application/x-7z-compressed 7z
application/vnd.ms-xpsdocument xps

项目中使用的代码

// Tika facade used to detect the MIME type from the uploaded content itself.
final Tika tika = new Tika();

// Whitelist of MIME types the server accepts for attachments.
Set<String> allowMimeType =
        ImmutableSet.of("image/pjpeg","application/pdf","application/msword","image/jpeg",
                "image/x-png","image/tiff","application/vnd.ms-excel","application/zip",
                "image/bmp","image/x-bitmap","image/x-pixmap","image/jpg",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" /*xlsx*/
                ,"application/x-rar-compressed","application/rtf","application/x-tika-ooxml",/*xls*/
                "application/x-bplist"/*pdf*/,"application/pdf",
                "application/vnd.ms-word.document.macroenabled.12"/*docm*/,"application/x-tika-msoffice"/*pdf*/,
                "application/vnd.openxmlformats-officedocument.presentationml.presentation"/*pptx*/
                ,"application/x-7z-compressed","application/vnd.ms-xpsdocument"/*xps*/);

// Content of the uploaded attachment; presumably prjInfoFile is the upload object
// supplied by the web framework (confirm against the enclosing method).
byte[] byteInfoFile = prjInfoFile.getBytes();

// An empty upload is not type-checked.
if (byteInfoFile.length > 0) {

    // Detect the type from the same bytes that were just length-checked, so the
    // file being validated is the one whose type is tested against the whitelist.
    String mimeType = tika.detect(byteInfoFile);

    log.info("文件类型:{}", mimeType);

    if (!allowMimeType.contains(mimeType)) {
        // Reject the request when the detected type is not whitelisted.
        return "error:服务器暂不接受此类型的附件";
    }
}
    
原文地址:https://www.cnblogs.com/passedbylove/p/12732280.html