We modified our CATS 0.8.0 to handle WordPerfect, Works, Docx, and ODT documents. On CentOS 5.x, we had to download and compile libwps to handle Works. We downloaded and compiled doctotext from silvercoders.com (GPL) to handle docx conversion. WordPerfect conversion is handled by libwpd (EPEL repo?). ODT conversion is handled by odt2txt (RPMForge repo).
We're currently working on teaching CATS how to handle (OCR) image-based PDF files. When we get that code ready, I will post it too.
Switch statement in DocumentToText.php:
Code: Select all /* Use different methods to extract text depending on the type of document. */
/* Build the shell command ($command) that converts the document to plain
 * text, based on $documentType. Each external-converter case first checks
 * that the converter's path constant is configured; when it is empty, an
 * error is recorded via _setError() and false is returned.
 *
 * $nativeEncoding and $convertEncoding are only assigned here; they are
 * presumably consumed by the code following this switch (not visible in
 * this snippet).
 *
 * NOTE(review): the ODT, DOCX, WPD and WPS cases declare 'ISO-8859-1' as
 * the native encoding, mirroring the DOC case. Confirm that odt2txt,
 * doctotext, wpd2text and wps2text really emit ISO-8859-1 on this system.
 */
switch ($documentType)
{
    case DOCUMENT_TYPE_DOC:
        if (ANTIWORD_PATH == '')
        {
            $this->_setError('The DOC format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $command = '"'. ANTIWORD_PATH . '" -m ' . ANTIWORD_MAP . ' '
                 . $escapedFilename;
        break;

    case DOCUMENT_TYPE_PDF:
        if (PDFTOTEXT_PATH == '')
        {
            $this->_setError('The PDF format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $convertEncoding = false;
        /* The trailing '-' is passed to pdftotext as its output argument. */
        $command = '"'. PDFTOTEXT_PATH . '" -layout ' . $escapedFilename . ' -';
        break;

    case DOCUMENT_TYPE_HTML:
        if (HTML2TEXT_PATH == '')
        {
            $this->_setError('The HTML format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $convertEncoding = false;
        $command = '"'. HTML2TEXT_PATH . '" -nobs ' . $escapedFilename;
        break;

    case DOCUMENT_TYPE_TEXT:
        /* Plain text needs no external converter. */
        return $this->_readTextFile($fileName);

    case DOCUMENT_TYPE_RTF:
        /* RTF is converted in two steps: unrtf's output is piped through
         * html2text, so both tools must be configured.
         */
        if (HTML2TEXT_PATH == '')
        {
            $this->_setError('The HTML format has not been configured, which is required for the RTF format.');
            return false;
        }

        if (UNRTF_PATH == '')
        {
            $this->_setError('The RTF format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $convertEncoding = false;
        $command = '"'. UNRTF_PATH . '" '.$escapedFilename.' | "'. HTML2TEXT_PATH . '" -nobs ';
        break;

    case DOCUMENT_TYPE_ODT:
        if (ODT2TXT_PATH == '')
        {
            $this->_setError('The ODT format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $command = '"'. ODT2TXT_PATH . '" ' . $escapedFilename;
        break;

    case DOCUMENT_TYPE_DOCX:
        if (DOCTOTEXT_PATH == '')
        {
            $this->_setError('The DOCX format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $command = '"'. DOCTOTEXT_PATH . '" ' . $escapedFilename;
        break;

    case DOCUMENT_TYPE_WPD:
        if (WPD2TEXT_PATH == '')
        {
            $this->_setError('The WPD format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $command = '"'. WPD2TEXT_PATH . '" ' . $escapedFilename;
        break;

    case DOCUMENT_TYPE_WPS:
        if (WPS2TEXT_PATH == '')
        {
            $this->_setError('The WPS format has not been configured.');
            return false;
        }

        $nativeEncoding = 'ISO-8859-1';
        $command = '"'. WPS2TEXT_PATH . '" ' . $escapedFilename;
        break;

    case DOCUMENT_TYPE_UNKNOWN:
    default:
        $this->_setError('This file format is unknown format and is not yet supported by CATS.');
        return false;
}
Defines in FileUtility.php:
Code: Select all
define('DOCUMENT_TYPE_UNKNOWN', 0);
define('DOCUMENT_TYPE_PDF', 100);   /* PDF: 'application/pdf' or a .pdf extension. */
define('DOCUMENT_TYPE_DOC', 200);   /* Word: 'application/msword' or a .doc extension. */
define('DOCUMENT_TYPE_RTF', 300);   /* Rich Text: RTF content types or a .rtf extension. */
define('DOCUMENT_TYPE_DOCX', 400);  /* Word 2007+: .docx extension or its content type. */
define('DOCUMENT_TYPE_HTML', 500);  /* HTML: 'text/html' or a .html / .htm extension. */
define('DOCUMENT_TYPE_ODT', 600);   /* OpenDocument Text: .odt extension or its content types. */
define('DOCUMENT_TYPE_TEXT', 700);  /* Plain text: 'text/plain' or a .txt extension. */
define('DOCUMENT_TYPE_WPD', 800);   /* WordPerfect: 'application/wordperfect' or a .wpd extension. */
define('DOCUMENT_TYPE_WPS', 900);   /* Microsoft Works: .wps extension or its content types. */
getDocumentType function in FileUtility.php:
/**
 * Determines a file's document type from its MIME content type and/or its
 * file extension.
 *
 * The checks run in a fixed order (text, RTF, DOC, DOCX, PDF, HTML, ODT,
 * WPD, WPS) and the first match wins. Within each check, either a matching
 * content type or a matching extension is sufficient.
 *
 * Content types are compared strictly, so a non-string value (such as the
 * default false) never matches. The extension is lower-cased before
 * comparison so that e.g. 'RESUME.DOC' is recognized; this is harmless if
 * getFileExtension() already returns lower case (not visible here).
 *
 * @param string $filename Name of the file; its extension is obtained via
 *                         self::getFileExtension().
 * @param string|boolean $contentType MIME content type reported for the
 *                         file, or false if it is not known.
 * @return integer One of the DOCUMENT_TYPE_* constants;
 *                 DOCUMENT_TYPE_UNKNOWN when nothing matches.
 */
public static function getDocumentType($filename, $contentType = false)
{
    $fileExtension = strtolower((string) self::getFileExtension($filename));

    if ($contentType === 'text/plain' || $fileExtension === 'txt')
    {
        return DOCUMENT_TYPE_TEXT;
    }

    $rtfContentTypes = array('application/rtf', 'text/rtf', 'text/richtext');
    if (in_array($contentType, $rtfContentTypes, true) ||
        $fileExtension === 'rtf')
    {
        return DOCUMENT_TYPE_RTF;
    }

    if ($contentType === 'application/msword' || $fileExtension === 'doc')
    {
        return DOCUMENT_TYPE_DOC;
    }

    /* The second entry is the registered MIME type for .docx files; the
     * first is the legacy value the original code already accepted.
     */
    $docxContentTypes = array(
        'application/vnd.ms-word.document.12',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    );
    if (in_array($contentType, $docxContentTypes, true) ||
        $fileExtension === 'docx')
    {
        return DOCUMENT_TYPE_DOCX;
    }

    if ($contentType === 'application/pdf' || $fileExtension === 'pdf')
    {
        return DOCUMENT_TYPE_PDF;
    }

    if ($contentType === 'text/html' ||
        in_array($fileExtension, array('html', 'htm'), true))
    {
        return DOCUMENT_TYPE_HTML;
    }

    $odtContentTypes = array(
        'application/vnd.oasis.opendocument.text',
        'application/x-vnd.oasis.opendocument.text'
    );
    if (in_array($contentType, $odtContentTypes, true) ||
        $fileExtension === 'odt')
    {
        return DOCUMENT_TYPE_ODT;
    }

    /* 'application/vnd.wordperfect' is the IANA-registered type;
     * 'application/wordperfect' is the value the original code accepted.
     */
    $wpdContentTypes = array(
        'application/wordperfect',
        'application/vnd.wordperfect'
    );
    if (in_array($contentType, $wpdContentTypes, true) ||
        $fileExtension === 'wpd')
    {
        return DOCUMENT_TYPE_WPD;
    }

    $wpsContentTypes = array(
        'application/vnd.ms-works',
        'application/x-msworks-wp',
        'zz-application/zz-winassoc-wps'
    );
    if (in_array($contentType, $wpsContentTypes, true) ||
        $fileExtension === 'wps')
    {
        return DOCUMENT_TYPE_WPS;
    }

    return DOCUMENT_TYPE_UNKNOWN;
}
Parser settings in Config.php:
Code: Select all
/* Text parser settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). On Windows, installing in C:\antiword\ is
 * recommended, in which case you should set ANTIWORD_PATH (below) to
 * 'C:\\antiword\\antiword.exe'. Windows Antiword will have problems locating
 * mapping files if you install it anywhere but C:\antiword\.
 */
define('ANTIWORD_PATH', "/usr/bin/antiword");
define('ANTIWORD_MAP', '8859-1.txt');

/* XPDF / pdftotext settings. Remember to use double backslashes (\\) to
 * represent one backslash (\).
 * http://www.foolabs.com/xpdf/
 */
define('PDFTOTEXT_PATH', "/usr/bin/pdftotext");

/* html2text settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). 'html2text' can be found at:
 * http://www.mbayer.de/html2text/
 */
define('HTML2TEXT_PATH', "/usr/bin/html2text");

/* UnRTF settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). 'unrtf' can be found at:
 * http://www.gnu.org/software/unrtf/unrtf.html
 */
define('UNRTF_PATH', "/usr/bin/unrtf");

/* ODT2TXT settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). 'odt2txt' can be found at:
 * http://stosberg.net/odt2txt/
 */
define('ODT2TXT_PATH', "/usr/bin/odt2txt");

/* DOCTOTEXT settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). 'doctotext' can be found at:
 * http://sourceforge.net/projects/doctotext/
 *
 * NOTE(review): this define was missing although the DOCX conversion code
 * reads DOCTOTEXT_PATH. The path below follows the other settings; adjust
 * it to wherever doctotext was actually installed, or set it to '' to
 * disable DOCX support.
 */
define('DOCTOTEXT_PATH', "/usr/bin/doctotext");

/* WPD2TEXT settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). 'wpd2text' can be found at:
 * http://libwpd.sourceforge.net/
 */
define('WPD2TEXT_PATH', "/usr/bin/wpd2text");

/* WPS2TEXT settings. Remember to use double backslashes (\\) to represent
 * one backslash (\). 'wps2text' can be found at:
 * http://libwps.sourceforge.net/
 */
define('WPS2TEXT_PATH', "/usr/bin/wps2text");