diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c | 910 |
1 files changed, 910 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c new file mode 100644 index 00000000..d49140d4 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/contrib/rtf2html/rtf2html.c @@ -0,0 +1,910 @@ +/* RTF2HTML.c, Chuck Shotton - 6/21/93 */ +/************************************************************************ + * This program takes a stab at converting RTF (Rich Text Format) files + * into HTML. There are some limitations that keep RTF from being able to + * easily represent things like in-line images and anchors as styles. In + * particular, RTF styles apply to entire "paragraphs", so anchors or + * images in the middle of a text stream can't easily be represented by + * styles. The intent is to ultimately use something like embedded text + * color changes to represent these constructs. + * + * In the meantime, you can take existing Word documents, apply the + * correct style sheet, and convert them to HTML with this tool. + * + * AUTHOR: Chuck Shotton, UT-Houston Academic Computing, + * + * Dmitry Potapov, CapitalSoft + * + * David Lippi, Comune di Prato, Italy + * + * Gabriele Bartolini, Comune di Prato, Italy + * + * USAGE: rtf2html [rtf_filename] + * + * BEHAVIOR: + * rtf2html will open the specified RTF input file or read from + * standard input, writing converted HTML to standard output. + * + * NOTES: + * The RTF document must be formatted with a style sheet that has + * style numberings that conform to the style_mappings table + * defined in this source file. Characters are converted according + * to the ANSI Windows 1252 code or Macintosh. + * + * MODIFICATIONS: + * 6/21/93 : Chuck Shotton - created version 1.0. + * 11/26/98 : Dmitry Potapov - version 1.1 beta + * 05/07/04 : David Lippi, Gabriele Bartolini - version 1.2 + * + * Copyright (C) 2004 Comune di Prato + * + * For copyright details, see the file COPYING in your distribution + * or the GNU General Public License (GPL) version 2 or later + * <http://www.gnu.org/copyleft/gpl.html> + * + ************************************************************************/ + +/* Note, the source is formated with 4 character tabs */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include "charset1252.h" +#include "charsetmac.h" + +#ifdef _MSC_VER +# define strcasecmp _stricmp +#endif + +#ifndef TRUE +#define TRUE -1 +#define FALSE 0 +#endif + +#define MAX_LEVELS 40 /*defines the # of nested in-line styles (pairs of {})*/ +#define MAX_RTF_TOKEN 40 + +#define MAX_INLINE_STYLES 5 /*defines # of in-line styles, bold, italic, etc.*/ + +typedef struct tag_StyleState +{ + unsigned char s: MAX_INLINE_STYLES; +} TStyleState; + +typedef enum { s_plain, s_bold, s_italic, s_underline, s_hidden, /*in-line styles*/ + s_para, s_br, /*pseudo style*/ + s_h0, s_h1, s_h2, s_h3, s_h4, s_h5, s_h6 /*heading styles*/ +} StyleState; + +char *styles[][2] = { /*HTML Start and end tags for styles*/ + {"", ""}, + {"<strong>", "</strong>"}, + {"<em>", "</em>"}, + {"", ""}, + {"<!-- ", " -->"}, + {"\n", "\n"}, /* {"\n<p>", "</p>\n"}, */ + {"<br />\n",""}, + {"", ""}, + {"<h1>", "</h1>"}, + {"<h2>", "</h2>"}, + {"<h3>", "</h3>"}, + {"<h4>", "</h4>"}, + {"<h5>", "</h5>"}, + {"<h6>", "</h6>"} +}; + +/* style_mappings maps the style numbers in a RTF style sheet into one of the*/ +/* (currently) six paragraph-oriented HTML styles (i.e. heading 1 through 6.)*/ +/* Additional styles for lists, etc. should be added here. Style info */ +/* ultimately should be read from some sort of config file into these tables.*/ + +#define MAX_NAME_LEN 40 +char style_name[MAX_NAME_LEN]; + +#define STYLE_NUMBER 7 +char *style_namings[STYLE_NUMBER] = { + "", "heading 1", "heading 2", "heading 3", "heading 4", "heading 5", + "heading 6" +}; +char style_mappings[STYLE_NUMBER][MAX_RTF_TOKEN]; +char style_number[MAX_RTF_TOKEN]; + +/* RTF tokens that mean something to the parser. All others are ignored. */ + +typedef enum { + t_start, + t_fonttbl, t_colortbl, t_stylesheet, t_info, t_s, t_b, t_ul, t_ulw, + t_uld, t_uldb, t_i, t_v, t_plain, t_par, t_pict, t_tab, t_bullet, + t_cell, t_row, t_line, t_endash, t_emdash, t_rquote, + t_end +} TokenIndex; + +char *tokens[] = { + "###", + "fonttbl", "colortbl", "stylesheet", "info", "s", "b", "ul", "ulw", + "uld", "uldb", "i", "v", "plain", "par", "pict", "tab", "bullet", + "cell", "row", "line", "endash", "emdash", "rquote", + "###" +}; + +TStyleState style_state[MAX_LEVELS], curr_style; +short curr_heading; + +void (*RTF_DoControl)(FILE*,char*,char*); +char isBody; +char* title; +//FILE* f; + +short level, /*current {} nesting level*/ + skip_to_level,/*{} level to which parsing should skip (used to skip */ + /* font tables, style sheets, color tables, etc.) */ + gobble, /*Flag set to indicate all input should be discarded */ + ignore_styles;/*Set to ignore inline style expansions after style use*/ + +/* Charset */ +unsigned char** charset_table; + +#define CHARSET_DEFAULT 0 // Index of the default charset to use +#define CHARSET_NUMBER 2 // Number of charset used +#define CHARSET_MAX_LENGTH 20 // Max numbero of char in the charset +// metadata used in rtf standard for the charset definition +unsigned char *charset[CHARSET_NUMBER] = { + "ansi", + "mac" +}; +// variable with the charset definition +unsigned char **charset_variablename[CHARSET_NUMBER] = { + charset1252, + mac +}; + +/**************************************/ + +int openfile (char * filename, FILE ** f) +{ + int rv = 1; + + if (filename) + { + if (!(*f = fopen (filename, "r"))) + { + fprintf (stderr, "\nError: Input file %s not found.\n", filename); + rv = 0; + } + else + { + title = filename; + } + } + else + { + *f = stdin; + title="STDIN"; + } + return rv; +} + +/**************************************/ + +int closefile (FILE * f) +{ + return fclose (f); +} + +/**************************************/ + +char RTF_GetChar( FILE* f ) +{ + char ch; + do + { + ch = fgetc( f ); + } while ((ch=='\r')||(ch=='\n')); + return ch; +} + +/**************************************/ + +char RTF_UnGetChar(FILE* f, char ch) +{ + return ungetc(ch, f); +} + +/**************************************/ + +void RTF_PutStr(char* s) +{ + if (gobble) return; + fputs(s, stdout); +} + +/**************************************/ + +void RTF_PutHeader() +{ + RTF_PutStr("<head>\n<title>"); + RTF_PutStr(title); + RTF_PutStr("</title>\n"); + RTF_PutStr("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">\n"); + RTF_PutStr("</head>\n"); +} + +/**************************************/ + +void RTF_PutChar(char ch) +{ + if (gobble) return; + if (!isBody) + { + RTF_PutHeader(); + RTF_PutStr("<body>\n"); + isBody=TRUE; + } + switch (ch) { + case '<': + RTF_PutStr("<"); + break; + + case '>': + RTF_PutStr(">"); + break; + + case '&': + RTF_PutStr("&"); + break; + + default: + fputc(ch, stdout); + } +} + +/**************************************/ + +void RTF_PlainStyle (TStyleState* s) +{ + int i; + for(i=0;i<MAX_INLINE_STYLES;i++) + { + if(s->s & (1<<i)) + RTF_PutStr(styles[i][1]); + } + s->s=0; +} + +/**************************************/ + +void RTF_SetStyle(TStyleState* s, StyleState style) +{ + if( (!ignore_styles||(style==s_hidden)) && ((s->s&(1<<style))==0) ) + { + RTF_PutStr(styles[style][0]); + s->s|=(1<<style); + } +} + +/**************************************/ + +void RTF_PushState(short* level) +{ + if(*level>=MAX_LEVELS) + { + fprintf(stderr,"Exceed maximum level\n"); + exit(-1); + } + style_state[*level]=curr_style; + (*level)++; +} + +/**************************************/ + +void RTF_PopState(short* level) +{ + int j; + TStyleState new_style; + + if(*level<1) + { + fprintf(stderr,"RTF parse error: unexpected '}'\n"); + exit(-1); + } + new_style = style_state[*level-1]; + /*close off any in-line styles*/ + for (j=0;j<MAX_INLINE_STYLES;j++) + { + if ( ((curr_style.s & (1<<j))!=0) && ((new_style.s & (1<<j))==0) ) + { + curr_style.s &= ~(1<<j); + RTF_PutStr(styles[j][1]); + } + } + + for (j=0;j<MAX_INLINE_STYLES;j++) + { + if( ((curr_style.s & (1<<j))==0) && ((new_style.s & (1<<j))!=0) ) + RTF_PutStr(styles[j][0]); + } + (*level)--; + curr_style = new_style; + + if (*level == skip_to_level) { + skip_to_level = -1; + gobble = FALSE; + } +} + +/**************************************/ +/* Map a style number into a HTML heading */ + +short RTF_MapStyle(char* s) +{ + int i; + for (i=0;i<7;i++) + if (!strcmp(style_mappings[i], s)) + return (i); + return (0); +} + +/**************************************/ + +void RTF_AddStyleMap(char* name, char* number) +{ + int i, len; + len=strlen(name); + if( name[len-1]==';') name[--len]=0; + for(i=0;i<STYLE_NUMBER;i++) + { + if(!strcasecmp(name,style_namings[i])) + { + strcpy(style_mappings[i],number); + return; + } + } +} + +/**************************************/ + +void RTF_BuildName(char* token, char* ch, unsigned is_string) +{ + int len; + char *p; + len = strlen(token); + if(len>=MAX_NAME_LEN-1) + return; + if (is_string) + { + for (p = ch; p && *p; ++p) + { + token[len]=*p; + ++len; + } + } + else + { + token[len] = *ch; + ++len; + } + token[len]='\0'; +} + + +/**************************************/ + +void RTF_ClearName(char* token) +{ + token[0]=0; +} + +/**************************************/ + +TokenIndex GetTokenIndex(char* control) +{ + TokenIndex i; + + for (i=t_start; i<t_end; i++) + { + if(control[0]==tokens[i][0]) /* Added for fast compare */ + { + if (!strcmp(control, tokens[i])) + { + break; + } + } + } + return i; +} + +/**************************************/ + +void RTF_DoStyleControl (FILE* f, char* control, char* arg) +{ + if(GetTokenIndex(control)==t_s) + { + strcpy(style_number,arg); + } +} + +/**************************************/ + +int chartoi(char ch) +{ + if((ch>='0')&&(ch<='9')) + return ch-'0'; + if((ch>='A')&&(ch<='Z')) + return ch-'A'+10; + if((ch>='a')&&(ch<='z')) + return ch-'a'+10; + return -1; +} + +/**************************************/ + +void RTF_BuildArg (FILE * f, char ch, char* arg) +{ + int i=0; + + if(feof(f)) + { + arg[0]=0; + return; + } + if(ch=='-') + { + arg[i++]='-'; + ch = RTF_GetChar( f ); + if(feof(f)) + { + arg[0]=0; + return; + } + } + for(;isdigit(ch);i++) + { + arg[i]=ch; + if(i>=MAX_RTF_TOKEN-1) + { + arg[MAX_RTF_TOKEN-1]=0; + while(isdigit(ch)) { + ch = RTF_GetChar( f ); + if(feof(f)) + return; + } + break; + } + ch = RTF_GetChar( f ); + if(feof(f)) + { + arg[i+1]=0; + return; + } + } + arg[i]=0; + if(!isspace(ch)) + { + RTF_UnGetChar(f, ch); + } +} + +/**************************************/ + +void RTF_BuildToken (FILE* f, char ch) +{ + int i; + + for(i=1;;i++) + { + char token[MAX_RTF_TOKEN], arg[MAX_RTF_TOKEN]; + token[i-1]=ch; + if(i>=MAX_RTF_TOKEN-1) + { + do { + ch = RTF_GetChar( f ); + if(feof(f)) + return; + } while (isalpha(ch)); + RTF_BuildArg(f, ch,arg); + return; + } + ch = RTF_GetChar( f ); + if(feof(f)) + { + token[i]=0; + RTF_DoControl(f,token,""); + return; + } + if( !isalpha(ch) ) + { + token[i]=0; + RTF_BuildArg(f, ch,arg); + RTF_DoControl(f,token,arg); + return; + } + } +} + +/**************************************/ + +void RTF_backslash(FILE* f, char** pch, char* pf) +{ + int ch; + *pf=FALSE; + ch = RTF_GetChar( f ); + if(feof(f)) + { + fprintf(stderr,"Unexpected end of file\n"); + return; + } + switch (ch) + { + case '\\': + *pch=charset_table[92]; *pf=TRUE; + break; + case '{': + *pch=charset_table[123]; *pf=TRUE; + break; + case '}': + *pch=charset_table[125]; *pf=TRUE; + break; + case '*': + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + if(skip_to_level>level-1||skip_to_level==-1) + skip_to_level = level-1; + break; + case '\'': + { + char ch1, ch2; + ch1 = RTF_GetChar( f ); + ch2 = RTF_GetChar( f ); + if(!feof(f)) + { + if(isxdigit(ch1)&&isxdigit(ch2)) + { + ch = chartoi(ch1)*16+chartoi(ch2); + *pch = charset_table[ch-1]; *pf=TRUE; + } else { + fprintf(stderr,"RTF Error: unexpected '%c%c' after \\\'\n",ch1,ch2); + } + } + break; + } + default: + if (isalpha(ch)) + { + RTF_BuildToken(f, ch); + } else { + fprintf(stderr, "\nRTF Error: unexpected '%c' after \\.\n", ch); + } + break; + } +} + +/**************************************/ + +void RTF_ParseStyle(FILE * f) +{ + char ch, pf; + char *code; + int level0; + void (*PrevDoControl)(FILE*,char*,char*); + + level0=level; + PrevDoControl=RTF_DoControl; + RTF_DoControl=RTF_DoStyleControl; + + RTF_ClearName(style_name); + style_number[0]=0; + while (1) + { + ch = RTF_GetChar( f ); + if(feof(f)) + break; + switch (ch) + { + case '\\': + RTF_backslash(f, &code, &pf); + if(pf) + { + RTF_BuildName(style_name, code, 1); + } else { + RTF_ClearName(style_name); + } + break; + + case '{': + level++; + RTF_ClearName(style_name); + break; + + case '}': + if(level0+1==level) + { + if(style_number[0]!=0) + { + RTF_AddStyleMap(style_name,style_number); + style_number[0]=0; + } + } else if(level0==level) { + RTF_DoControl=PrevDoControl; + RTF_UnGetChar(f, ch); + return; + } + level--; + RTF_ClearName(style_name); + break; + + default: + RTF_BuildName(style_name, &ch, 0); + break; + } + } /* while */ +} + +/**************************************/ +/* Perform actions for RTF control words */ + +void RTF_DoBodyControl (FILE * f, char* control,char* arg) +{ + short style; + + if (gobble) return; + + switch (GetTokenIndex(control)) + { + case t_stylesheet: + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + skip_to_level = level-1; + RTF_ParseStyle( f ); + break; + case t_fonttbl: /*skip all of these and their contents!*/ + case t_colortbl: + case t_info: + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + skip_to_level = level-1; + break; + case t_pict: + gobble = TRUE; /*perform no output, ignore commands 'til level-1*/ + if(skip_to_level>=level || skip_to_level==-1) + skip_to_level = level-1; + break; + + + case t_s: /*Style*/ + if (!curr_heading) + { + style = RTF_MapStyle (arg); + if(style) + { + curr_heading = s_h0 + style; + RTF_PutStr(styles[curr_heading][0]); + ignore_styles = TRUE; + } + } + break; + + case t_b: /*Bold*/ + RTF_SetStyle(&curr_style,s_bold); + break; + + case t_ulw: + case t_uld: + case t_uldb: + case t_ul: /*Underline, maps to "emphasis" HTML style*/ + RTF_SetStyle(&curr_style,s_underline); + break; + + case t_i: /*Italic*/ + RTF_SetStyle(&curr_style,s_italic); + break; + + case t_v: /* Hidden*/ + RTF_SetStyle(&curr_style,s_hidden); + break; + + case t_par: /*Paragraph*/ + if (curr_heading!=s_plain) { + RTF_PutStr(styles[curr_heading][1]); + curr_heading = s_plain; + } else { + RTF_PutStr(styles[s_para][0]); + } + ignore_styles = FALSE; + break; + + case t_plain: /*reset inline styles*/ + RTF_PlainStyle(&curr_style); + break; + case t_cell: + case t_tab: + RTF_PutChar(' '); + break; + case t_endash: + case t_emdash: + RTF_PutChar('-'); + break; + case t_line: + case t_row: + RTF_PutStr(styles[s_br][0]); + break; + case t_bullet: + RTF_PutChar('\xb7'); + break; + case t_start: + case t_end: + break; + case t_rquote: + //RTF_PutStr("’"); + RTF_PutStr("'"); + break; + } + +} + +/**************************************/ +/* RTF_Parse is a crude, ugly state machine that understands enough of */ +/* the RTF syntax to be dangerous. */ + +void RTF_ParseBody( FILE* f ) +{ + char ch, pf; + char* code; + + RTF_DoControl=RTF_DoBodyControl; + level = 0; + skip_to_level = -1; + gobble = FALSE; + ignore_styles = FALSE; + + while (1) + { + ch = RTF_GetChar( f ); + if (feof(f)) + { + break; + } + switch (ch) + { + case '\\': + RTF_backslash(f, &code,&pf); + if(pf && code) + RTF_PutStr(code); + break; + + case '{': + RTF_PushState(&level); + break; + + case '}': + RTF_PopState(&level); + break; + + default: + RTF_PutChar(ch); + break; + } + }/*while*/ +} + +/**************************************/ + +int RTF_Parse (FILE* f) +{ + RTF_PutStr("<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML//EN\">\n<html>\n"); + + isBody=FALSE; + + RTF_ParseBody(f); + + if (isBody) RTF_PutStr("</body>\n"); + + RTF_PutStr("</html>\n"); + + return 0; +} + +/**************************************/ + +void Initialize() +{ + int i; + + for (i=0;i<MAX_LEVELS;i++) + style_state[i].s=s_plain; + + curr_style.s=s_plain; + curr_heading = s_plain; + + // Set default styles maping + style_mappings[0][0]=0; + for(i=1;i<=6;i++) + sprintf(style_mappings[i],"%d",256-i); +} + +/**************************************/ + +int RTF_FindCharset(FILE * f) +{ + char ch; + char code[CHARSET_MAX_LENGTH]; + int metadata = 0; + int i = 0; + + while ( !feof(f) ) + { + ch = RTF_GetChar( f ); + if ( ch == '\\' ) + { + metadata++; + } + if ( metadata == 2 ) // the second metadata is the charset used + { + if ( ch != '\\' ) + { + code[i] = ch; + i++; + } + } + if ( metadata > 2 ) + { + code[i] = '\0'; + break; + } + } + + + for ( i = 0; i < CHARSET_NUMBER ; i++) + { + if ( strcmp( (const char *)charset[i], (const char *) code ) == 0 ) + { + charset_table = charset_variablename[i]; + break; + }; + } + if ( i == CHARSET_NUMBER ) + { + charset_table = charset_variablename[CHARSET_DEFAULT]; + } + + return 1; // always true! +} + +/**************************************/ + +int main(int argc,char** argv) +{ + int rv = 0; + FILE *f = NULL; + + Initialize(); + + if ( argc > 1) + { + if( strcmp(argv[1],"--help")==0 || strcmp(argv[1],"-H")==0 ) + { + printf("Use: %s [rtf_filename]\n",argv[0]); + rv = 0; + } else if ( strcmp(argv[1],"--version")==0 || strcmp(argv[1],"-V")==0 ) { + printf("rtf2html version 1.2\n"); + rv = 0; + } + else + { + rv = openfile(argv[1], &f); + if ( rv ) rv = RTF_FindCharset(f); + if ( rv ) + { + rewind(f); + rv = RTF_Parse(f); + } + if ( rv ) rv = closefile(f); + } + } + else + { + printf("Use: %s [rtf_filename]\n",argv[0]); + } + return rv; +} |