diff options
author | tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> | 2010-03-01 19:17:32 +0000 |
---|---|---|
committer | tpearson <tpearson@283d02a7-25f6-0310-bc7c-ecb5cbfe19da> | 2010-03-01 19:17:32 +0000 |
commit | e38d2351b83fa65c66ccde443777647ef5cb6cff (patch) | |
tree | 1897fc20e9f73a81c520a5b9f76f8ed042124883 /src/translators/btparse/string_util.c | |
download | tellico-e38d2351b83fa65c66ccde443777647ef5cb6cff.tar.gz tellico-e38d2351b83fa65c66ccde443777647ef5cb6cff.zip |
Added KDE3 version of Tellico
git-svn-id: svn://anonsvn.kde.org/home/kde/branches/trinity/applications/tellico@1097620 283d02a7-25f6-0310-bc7c-ecb5cbfe19da
Diffstat (limited to 'src/translators/btparse/string_util.c')
-rw-r--r-- | src/translators/btparse/string_util.c | 695 |
1 files changed, 695 insertions, 0 deletions
diff --git a/src/translators/btparse/string_util.c b/src/translators/btparse/string_util.c new file mode 100644 index 0000000..3713608 --- /dev/null +++ b/src/translators/btparse/string_util.c @@ -0,0 +1,695 @@ +/* ------------------------------------------------------------------------ +@NAME : string_util.c +@DESCRIPTION: Various string-processing utility functions: + bt_purify_string() + bt_change_case() + + and their helpers: + foreign_letter() + purify_special_char() +@GLOBALS : +@CALLS : +@CALLERS : +@CREATED : 1997/10/19, Greg Ward +@MODIFIED : 1997/11/25, GPW: renamed to from purify.c to string_util.c + added bt_change_case() and friends +@VERSION : $Id: string_util.c,v 1.10 1999/10/28 22:50:28 greg Rel $ +-------------------------------------------------------------------------- */ + +#include <stdlib.h> +#include <ctype.h> +#include <string.h> +#include <assert.h> +#include "error.h" +#include "btparse.h" +#include "bt_debug.h" + + +/* + * These definitions should be fixed to be consistent with HTML + * entities, just for fun. And perhaps I should add entries for + * accented letters (at least those supported by TeX and HTML). + */ +typedef enum +{ + L_OTHER, /* not a "foreign" letter */ + L_OSLASH_L, /* Eastern European {\o} */ + L_OSLASH_U, + L_LSLASH_L, /* {\l} */ + L_LSLASH_U, + L_OELIG_L, /* Latin {\oe} ligature */ + L_OELIG_U, + L_AELIG_L, /* {\ae} ligature */ + L_AELIG_U, + L_SSHARP_L, /* German "sharp s" {\ss} */ + L_SSHARP_U, + L_ACIRCLE_L, /* Nordic {\aa} */ + L_ACIRCLE_U, + L_INODOT_L, /* undotted i: {\i} */ + L_JNODOT_L /* {\j} */ +} bt_letter; + + +static const char * uc_version[] = +{ + NULL, /* L_OTHER */ + "\\O", /* L_OSLASH_L */ + "\\O", /* L_OSLASH_U */ + "\\L", /* L_LSLASH_L */ + "\\L", /* L_LSLASH_U */ + "\\OE", /* L_OELIG_L */ + "\\OE", /* L_OELIG_U */ + "\\AE", /* L_AELIG_L */ + "\\AE", /* L_AELIG_U */ + "SS", /* L_SSHARP_L -- for LaTeX 2.09 */ + "\\SS", /* L_SSHARP_U */ + "\\AA", /* L_ACIRCLE_L */ + "\\AA", /* L_ACIRCLE_U */ + "I", /* L_INODOT_L */ + "J" /* L_JNODOT_L */ +}; + +static const char * lc_version[] = +{ + NULL, /* L_OTHER */ + "\\o", /* L_OSLASH_L */ + "\\o", /* L_OSLASH_U */ + "\\l", /* L_LSLASH_L */ + "\\l", /* L_LSLASH_U */ + "\\oe", /* L_OELIG_L */ + "\\oe", /* L_OELIG_U */ + "\\ae", /* L_AELIG_L */ + "\\ae", /* L_AELIG_U */ + "\\ss", /* L_SSHARP_L */ + "\\ss", /* L_SSHARP_U */ + "\\aa", /* L_ACIRCLE_L */ + "\\aa", /* L_ACIRCLE_U */ + "\\i", /* L_INODOT_L */ + "\\j" /* L_JNODOT_L */ +}; + + + +/* ------------------------------------------------------------------------ +@NAME : foreign_letter() +@INPUT : str + start + stop +@OUTPUT : letter +@RETURNS : TRUE if the string delimited by start and stop is a foreign + letter control sequence +@DESCRIPTION: Determines if a character sequence is one of (La)TeX's + "foreign letter" control sequences (l, o, ae, oe, aa, ss, plus + uppercase versions). If `letter' is non-NULL, returns which + letter was found in it (as a bt_letter value). +@CALLS : +@CALLERS : purify_special_char() +@CREATED : 1997/10/19, GPW +@MODIFIED : +-------------------------------------------------------------------------- */ +static boolean +foreign_letter (char *str, int start, int stop, bt_letter * letter) +{ + char c1, c2; + bt_letter dummy; + + + /* + * This is written for speed, not flexibility -- adding new foreign + * letters would be trying and vexatious. + * + * N.B. my gold standard list of foreign letters is Kopka and Daly's + * *A Guide to LaTeX 2e*, section 2.5.6. + */ + + if (letter == NULL) /* so we can assign to *letter */ + letter = &dummy; /* without compunctions */ + *letter = L_OTHER; /* assume not a "foreign" letter */ + + c1 = str[start+0]; /* only two characters that we're */ + c2 = str[start+1]; /* interested in */ + + switch (stop - start) + { + case 1: /* one-character control sequences */ + switch (c1) /* (\o and \l) */ + { + case 'o': + *letter = L_OSLASH_L; return TRUE; + case 'O': + *letter = L_OSLASH_U; return TRUE; + case 'l': + *letter = L_LSLASH_L; return TRUE; + case 'L': + *letter = L_LSLASH_L; return TRUE; + case 'i': + *letter = L_INODOT_L; return TRUE; + case 'j': + *letter = L_JNODOT_L; return TRUE; + default: + return FALSE; + } + break; + case 2: /* two character control sequences */ + switch (c1) /* (\oe, \ae, \aa, and \ss) */ + { + case 'o': + if (c2 == 'e') { *letter = L_OELIG_L; return TRUE; } + case 'O': + if (c2 == 'E') { *letter = L_OELIG_U; return TRUE; } + + /* BibTeX 0.99 does not handle \aa and \AA -- but I do!*/ + case 'a': + if (c2 == 'e') + { *letter = L_AELIG_L; return TRUE; } + else if (c2 == 'a') + { *letter = L_ACIRCLE_L; return TRUE; } + else + return FALSE; + case 'A': + if (c2 == 'E') + { *letter = L_AELIG_U; return TRUE; } + else if (c2 == 'A') + { *letter = L_ACIRCLE_U; return TRUE; } + else + return FALSE; + + /* uppercase sharp-s -- new with LaTeX 2e (so far all I do + * is recognize it as a "foreign" letter) + */ + case 's': + if (c2 == 's') + { *letter = L_SSHARP_L; return TRUE; } + else + return FALSE; + case 'S': + if (c2 == 'S') + { *letter = L_SSHARP_U; return TRUE; } + else + return FALSE; + } + break; + default: + return FALSE; + } /* switch on length of control sequence */ + + internal_error ("foreign_letter(): should never reach end of function"); + return FALSE; /* to keep gcc -Wall happy */ + +} /* foreign_letter */ + + +/* ------------------------------------------------------------------------ +@NAME : purify_special_char() +@INPUT : *src, *dst - pointers into the input and output strings +@OUTPUT : *src - updated to point to the closing brace of the + special char + *dst - updated to point to the next available spot + for copying text to +@RETURNS : +@DESCRIPTION: "Purifies" a BibTeX special character. On input, *src should + point to the opening brace of a special character (ie. the + brace must be at depth 0 of the whole string, and the + character immediately following it must be a backslash). + *dst should point to the next spot to copy into the output + (purified) string. purify_special_char() will skip over the + opening brace and backslash; if the control sequence is one + of LaTeX's foreign letter sequences (as determined by + foreign_letter()), then it is simply copied to *dst. + Otherwise the control sequence is skipped. In either case, + text after the control sequence is either copied (alphabetic + characters) or skipped (anything else, including hyphens, + ties, and digits). +@CALLS : foreign_letter() +@CALLERS : bt_purify_string() +@CREATED : 1997/10/19, GPW +@MODIFIED : +-------------------------------------------------------------------------- */ +static void +purify_special_char (char *str, int * src, int * dst) +{ + int depth; + int peek; + + assert (str[*src] == '{' && str[*src + 1] == '\\'); + depth = 1; + + *src += 2; /* jump to start of control sequence */ + peek = *src; /* scan to end of control sequence */ + while (isalpha (str[peek])) + peek++; + if (peek == *src) /* in case of single-char, non-alpha */ + peek++; /* control sequence (eg. {\'e}) */ + + if (foreign_letter (str, *src, peek, NULL)) + { + assert (peek - *src == 1 || peek - *src == 2); + str[(*dst)++] = str[(*src)++]; /* copy first char */ + if (*src < peek) /* copy second char, downcasing */ + str[(*dst)++] = tolower (str[(*src)++]); + } + else /* not a foreign letter -- skip */ + { /* the control sequence entirely */ + *src = peek; + } + + while (str[*src]) + { + switch (str[*src]) + { + case '{': + depth++; + (*src)++; + break; + case '}': + depth--; + if (depth == 0) return; /* done with special char */ + (*src)++; + break; + default: + if (isalpha (str[*src])) /* copy alphabetic chars */ + str[(*dst)++] = str[(*src)++]; + else /* skip everything else */ + (*src)++; + } + } + + /* + * If we get here, we have unbalanced braces -- the '}' case should + * always hit a depth == 0 point if braces are balanced. No warning, + * though, because a) BibTeX doesn't warn about purifying unbalanced + * strings, and b) we (should have) already warned about it in the + * lexer. + */ + +} /* purify_special_char() */ + + +/* ------------------------------------------------------------------------ +@NAME : bt_purify_string() +@INOUT : instr +@INPUT : options +@OUTPUT : +@RETURNS : instr - same as input string, but modified in place +@DESCRIPTION: "Purifies" a BibTeX string. This consists of copying + alphanumeric characters, converting hyphens and ties to + space, copying spaces, and skipping everything else. (Well, + almost -- special characters are handled specially, of + course. Basically, accented letters have the control + sequence skipped, while foreign letters have the control + sequence preserved in a reasonable manner. See + purify_special_char() for details.) +@CALLS : purify_special_char() +@CALLERS : +@CREATED : 1997/10/19, GPW +@MODIFIED : +-------------------------------------------------------------------------- */ +void +bt_purify_string (char * string, ushort options) +{ + int src, /* both indeces into string */ + dst; + int depth; /* brace depth in string */ + unsigned orig_len; + + /* + * Since purification always copies or deletes chars, outstr will + * be no longer than string -- so nothing fancy is required to put + * an upper bound on its eventual size. + */ + + depth = 0; + src = 0; + dst = 0; + orig_len = strlen (string); + + DBG_ACTION (1, printf ("bt_purify_string(): input = %p (%s)\n", + string, string)); + + while (string[src] != (char) 0) + { + DBG_ACTION (2, printf (" next: >%c<: ", string[src])); + switch (string[src]) + { + case '~': /* "separator" characters -- */ + case '-': /* replaced with space */ + case ' ': /* and copy an actual space */ + string[dst++] = ' '; + src++; + DBG_ACTION (2, printf ("replacing with space")); + break; + case '{': + if (depth == 0 && string[src+1] == '\\') + { + DBG_ACTION (2, printf ("special char found")); + purify_special_char (string, &src, &dst); + } + else + { + DBG_ACTION (2, printf ("ordinary open brace")); + src++; + } + depth++; + break; + case '}': + DBG_ACTION (2, printf ("close brace")); + depth--; + src++; + break; + default: + if (isalnum (string[src])) /* any alphanumeric char -- */ + { + DBG_ACTION (2, printf ("alphanumeric -- copying")); + string[dst++] = string[src++]; /* copy it */ + } + else /* anything else -- skip it */ + { + DBG_ACTION (2, printf ("non-separator, non-brace, non-alpha")); + src++; + } + } /* switch string[src] */ + + DBG_ACTION (2, printf ("\n")); + + } /* while string[src] */ + + DBG_ACTION (1, printf ("bt_purify_string(): depth on exit: %d\n", depth)); + + string[dst] = (char) 0; + assert (strlen (string) <= orig_len); +} /* bt_purify_string() */ + + +/* ====================================================================== + * Case-transformation stuff + */ + + +/* ------------------------------------------------------------------------ +@NAME : convert_special_char() +@INPUT : transform +@INOUT : string + src + dst + start_sentence + after_colon +@RETURNS : +@DESCRIPTION: Does case conversion on a special character. +@GLOBALS : +@CALLS : +@CALLERS : +@CREATED : 1997/11/25, GPW +@MODIFIED : +-------------------------------------------------------------------------- */ +static void +convert_special_char (char transform, + char * string, + int * src, + int * dst, + boolean * start_sentence, + boolean * after_colon) +{ + int depth; + boolean done_special; + int cs_end; + int cs_len; /* counting the backslash */ + bt_letter letter; + const char * repl; + int repl_len; + +#ifndef ALLOW_WARNINGS + repl = NULL; /* silence "might be used" */ + /* uninitialized" warning */ +#endif + + /* First, copy just the opening brace */ + string[(*dst)++] = string[(*src)++]; + + /* + * Now loop over characters inside the braces -- stop when we reach + * the matching close brace, or when the string ends. + */ + depth = 1; /* because we're in a special char */ + done_special = FALSE; + + while (string[*src] != 0 && !done_special) + { + switch (string[*src]) + { + case '\\': /* a control sequence */ + { + cs_end = *src+1; /* scan over chars of c.s. */ + while (isalpha (string[cs_end])) + cs_end++; + + /* + * OK, now *src points to the backslash (so src+*1 points to + * first char. of control sequence), and cs_end points to + * character immediately following end of control sequence. + * Thus we analyze [*src+1..cs_end] to determine if the control + * sequence is a foreign letter, and use (cs_end - (*src+1) + 1) + * = (cs_end - *src) as the length of the control sequence. + */ + + cs_len = cs_end - *src; /* length of cs, counting backslash */ + + if (foreign_letter (string, *src+1, cs_end, &letter)) + { + if (letter == L_OTHER) + internal_error ("impossible foreign letter"); + + switch (transform) + { + case 'u': + repl = uc_version[(int) letter]; + break; + case 'l': + repl = lc_version[(int) letter]; + break; + case 't': + if (*start_sentence || *after_colon) + { + repl = uc_version[(int) letter]; + *start_sentence = *after_colon = FALSE; + } + else + { + repl = lc_version[(int) letter]; + } + break; + default: + internal_error ("impossible case transform \"%c\"", + transform); + } + + repl_len = strlen (repl); + if (repl_len > cs_len) + internal_error + ("replacement text longer than original cs"); + + strncpy (string + *dst, repl, repl_len); + *src = cs_end; + *dst += repl_len; + } /* control sequence is a foreign letter */ + else + { + /* not a foreign letter -- just copy the control seq. as is */ + + + strncpy (string + *dst, string + *src, cs_end - *src); + *src += cs_len; + assert (*src == cs_end); + *dst += cs_len; + } /* control sequence not a foreign letter */ + + break; + } /* case: '\\' */ + + case '{': + { + string[(*dst)++] = string[(*src)++]; + depth++; + break; + } + + case '}': + { + string[(*dst)++] = string[(*src)++]; + depth--; + if (depth == 0) + done_special = TRUE; + break; + } + + default: /* any other character */ + { + switch (transform) + { + /* + * Inside special chars, lowercase and title caps are same. + * (At least, that's bibtex's convention. I might change this + * at some point to be a bit smarter.) + */ + case 'l': + case 't': + string[(*dst)++] = tolower (string[(*src)++]); + break; + case 'u': + string[(*dst)++] = toupper (string[(*src)++]); + break; + default: + internal_error ("impossible case transform \"%c\"", + transform); + } + } /* default char */ + + } /* switch: current char */ + + } /* while: string or special char not done */ + +} /* convert_special_char() */ + + +/* ------------------------------------------------------------------------ +@NAME : bt_change_case() +@INPUT : +@OUTPUT : +@RETURNS : +@DESCRIPTION: Converts a string (in-place) to either uppercase, lowercase, + or "title capitalization"> +@GLOBALS : +@CALLS : +@CALLERS : +@CREATED : 1997/11/25, GPW +@MODIFIED : +-------------------------------------------------------------------------- */ +void +bt_change_case (char transform, + char * string, + ushort options) +{ + int len; + int depth; + int src, dst; /* indeces into string */ + boolean start_sentence; + boolean after_colon; + + src = dst = 0; + len = strlen (string); + depth = 0; + + start_sentence = TRUE; + after_colon = FALSE; + + while (string[src] != 0) + { + switch (string[src]) + { + case '{': + + /* + * At start of special character? The entire special char. + * will be handled here, as follows: + * - text at any brace-depth within the s.c. is case-mangled; + * punctuation (sentence endings, colons) are ignored + * - control sequences are left alone, unless they are + * one of the "foreign letter" control sequences, in + * which case they're converted to the appropriate string + * according to the uc_version or lc_version tables. + */ + if (depth == 0 && string[src+1] == '\\') + { + convert_special_char (transform, string, &src, &dst, + &start_sentence, &after_colon); + } + + /* + * Otherwise, it's just something in braces. This is probably + * a proper noun or something encased in braces to protect it + * from case-mangling, so we do not case-mangle it. However, + * we *do* switch out of start_sentence or after_colon mode if + * we happen to be there (otherwise we'll do the wrong thing + * once we're out of the braces). + */ + else + { + string[dst++] = string[src++]; + start_sentence = after_colon = FALSE; + depth++; + } + break; + + case '}': + string[dst++] = string[src++]; + depth--; + break; + + /* + * Sentence-ending punctuation and colons are handled separately + * to allow for exact mimicing of BibTeX's behaviour. I happen + * to think that this behaviour (capitalize first word of sentences + * in a title) is better than BibTeX's, but I want to keep my + * options open for a future goal of perfect compatability. + */ + case '.': + case '?': + case '!': + start_sentence = TRUE; + string[dst++] = string[src++]; + break; + + case ':': + after_colon = TRUE; + string[dst++] = string[src++]; + break; + + default: + if (isspace (string[src])) + { + string[dst++] = string[src++]; + } + else + { + if (depth == 0) + { + switch (transform) + { + case 'u': + string[dst++] = toupper (string[src++]); + break; + case 'l': + string[dst++] = tolower (string[src++]); + break; + case 't': + if (start_sentence || after_colon) + { + /* + * XXX BibTeX only preserves case of character + * immediately after a colon; I do two things + * differently: first, I pay attention to sentence + * punctuation, and second I force uppercase + * at start of sentence or after a colon. + */ + string[dst++] = toupper (string[src++]); + start_sentence = after_colon = FALSE; + } + else + { + string[dst++] = tolower (string[src++]); + } + break; + default: + internal_error ("impossible case transform \"%c\"", + transform); + } + } /* depth == 0 */ + else + { + string[dst++] = string[src++]; + } + } /* not blank */ + } /* switch on current character */ + + } /* while not at end of string */ + +} /* bt_change_case */ |