JAVA读取WORD,EXCEL,POWERPOINT,PDF文件的方法
来源:优易学  2011-11-29 16:15:45   【优易学:中国教育考试门户网】   资料下载   IT书店

  Office文档使用POI控件读取,PDF可以使用PDFBox 0.7.3控件,完全支持中文;用Xpdf也行,不过感觉PDFBox比较好,而且作者仍在更新。水平有限,万望各位指正。

  WORD:

  import org.apache.lucene.document.Document;

  import org.apache.lucene.document.Field;

  import org.apache.poi.hwpf.extractor.WordExtractor;

  import java.io.File;

  import java.io.InputStream;

  import java.io.FileInputStream;

  import com.search.code.Index;

  public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {

   String bodyText = null;

  try {

  WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream

  bodyText = ex.getText();

  if(!bodyText.equals("")){

  index.AddIndex(url, title, bodyText);

  }

  }catch (DocCenterException e) {

  throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e);

  }catch(Exception e){

  e.printStackTrace();

  }

  }

  return null;

  }

  Excel:

  import org.apache.lucene.document.Document;

  import org.apache.lucene.document.Field;

  import org.apache.poi.hwpf.extractor.WordExtractor;

  import org.apache.poi.hssf.usermodel.HSSFWorkbook;

  import org.apache.poi.hssf.usermodel.HSSFSheet;

  import org.apache.poi.hssf.usermodel.HSSFRow;

  import org.apache.poi.hssf.usermodel.HSSFCell;

  import java.io.File;

  import java.io.InputStream;

  import java.io.FileInputStream;

  import com.search.code.Index;

  /**
   * Extracts the cell contents of an Excel (HSSF) workbook and passes them to
   * the index.
   *
   * <p>Every sheet, row and cell that exists is visited in order. Each
   * non-empty cell is rendered with {@code HSSFCell.toString()} so that
   * numeric, boolean and formula cells are included instead of aborting the
   * whole extraction, and values are separated by a single space so adjacent
   * cells do not merge into one token. Nothing is indexed when the workbook
   * yields no text.
   *
   * @param index index receiving the extracted text through {@code AddIndex}
   * @param url   forwarded unchanged to {@code index.AddIndex}
   * @param title forwarded unchanged to {@code index.AddIndex}
   * @param is    input stream of the Excel file; this method does not close it
   * @return always {@code null}
   * @throws DocCenterException if reading the workbook or adding it to the
   *         index fails; the underlying exception is kept as the cause
   */
  public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {
    StringBuffer content = new StringBuffer();

    try {
      HSSFWorkbook workbook = new HSSFWorkbook(is); // open the Excel workbook

      for (int sheetIdx = 0; sheetIdx < workbook.getNumberOfSheets(); sheetIdx++) {
        HSSFSheet sheet = workbook.getSheetAt(sheetIdx); // current sheet
        if (sheet == null) {
          continue;
        }

        for (int rowIdx = 0; rowIdx <= sheet.getLastRowNum(); rowIdx++) {
          HSSFRow row = sheet.getRow(rowIdx); // current row; null when the row is not defined
          if (row == null) {
            continue;
          }

          // getLastCellNum() is the index of the last cell PLUS ONE, so the
          // upper bound is exclusive.
          short cellCount = row.getLastCellNum();
          for (short cellIdx = 0; cellIdx < cellCount; cellIdx++) {
            HSSFCell cell = row.getCell(cellIdx); // current cell; null when undefined
            if (cell == null) {
              continue;
            }

            // toString() works for every cell type, whereas
            // getStringCellValue() throws on non-string cells.
            String text = cell.toString();
            if (text != null && text.length() > 0) {
              content.append(text).append(' ');
            }
          }
        }
      }

      // StringBuffer.equals("") is never true; test the length instead.
      if (content.length() > 0) {
        index.AddIndex(url, title, content.toString());
      }
    } catch (Exception e) {
      // Report every failure to the caller instead of logging to stdout and
      // returning as if the workbook had been processed.
      throw new DocCenterException("无法从该Microsoft Excel文档中提取内容", e);
    }

    return null;
  }

  PowerPoint:

  import java.io.InputStream;

  import org.apache.lucene.document.Document;

  import org.apache.poi.hslf.HSLFSlideShow;

  import org.apache.poi.hslf.model.TextRun;

  import org.apache.poi.hslf.model.Slide;

  import org.apache.poi.hslf.usermodel.SlideShow;

  public Document getDocument(Index index, String url, String title, InputStream is)

  throws DocCenterException {

  StringBuffer content = new StringBuffer("");

  try{

[1] [2] 下一页

责任编辑:小草

文章搜索:
 相关文章
热点资讯
热门课程培训