summaryrefslogtreecommitdiffstats
path: root/tdespell2/plugins/ispell/ispell_checker.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'tdespell2/plugins/ispell/ispell_checker.cpp')
-rw-r--r--tdespell2/plugins/ispell/ispell_checker.cpp505
1 files changed, 505 insertions, 0 deletions
diff --git a/tdespell2/plugins/ispell/ispell_checker.cpp b/tdespell2/plugins/ispell/ispell_checker.cpp
new file mode 100644
index 000000000..c07d9a55f
--- /dev/null
+++ b/tdespell2/plugins/ispell/ispell_checker.cpp
@@ -0,0 +1,505 @@
+/* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/* tdespell2 - adopted from Enchant
+ * Copyright (C) 2003 Dom Lachowicz
+ * Copyright (C) 2004 Zack Rusin <[email protected]>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ * In addition, as a special exception, Dom Lachowicz
+ * gives permission to link the code of this program with
+ * non-LGPL Spelling Provider libraries (eg: a MSFT Office
+ * spell checker backend) and distribute linked combinations including
+ * the two. You must obey the GNU Lesser General Public License in all
+ * respects for all of the code used other than said providers. If you modify
+ * this file, you may extend this exception to your version of the
+ * file, but you are not obligated to do so. If you do not wish to
+ * do so, delete this exception statement from your version.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <string>
+#include <vector>
+
+#include "sp_spell.h"
+#include "ispell_checker.h"
+
+#include <tqmap.h>
+#include <tqdir.h>
+#include <tqfileinfo.h>
+
+/***************************************************************************/
+
+typedef struct str_ispell_map
+{
+ const char * lang;
+ const char * dict;
+ const char * enc;
+} IspellMap;
+
+static const char *ispell_dirs [] = {
+ "/usr/lib/ispell",
+ "/usr/local/lib/ispell",
+ "/usr/local/share/ispell",
+ "/usr/share/ispell",
+ "/usr/pkg/lib",
+ 0
+};
+static const IspellMap ispell_map [] = {
+ {"ca" ,"catala.hash" ,"iso-8859-1" },
+ {"ca_ES" ,"catala.hash" ,"iso-8859-1" },
+ {"cs" ,"czech.hash" ,"iso-8859-2" },
+ {"cs_CZ" ,"czech.hash" ,"iso-8859-2" },
+ {"da" ,"dansk.hash" ,"iso-8859-1" },
+ {"da_DK" ,"dansk.hash" ,"iso-8859-1" },
+ {"de" ,"deutsch.hash" ,"iso-8859-1" },
+ {"de_CH" ,"swiss.hash" ,"iso-8859-1" },
+ {"de_AT" ,"deutsch.hash" ,"iso-8859-1" },
+ {"de_DE" ,"deutsch.hash" ,"iso-8859-1" },
+ {"el" ,"ellhnika.hash" ,"iso-8859-7" },
+ {"el_GR" ,"ellhnika.hash" ,"iso-8859-7" },
+ {"en" ,"british.hash" ,"iso-8859-1" },
+ {"en_AU" ,"british.hash" ,"iso-8859-1" },
+ {"en_BZ" ,"british.hash" ,"iso-8859-1" },
+ {"en_CA" ,"british.hash" ,"iso-8859-1" },
+ {"en_GB" ,"british.hash" ,"iso-8859-1" },
+ {"en_IE" ,"british.hash" ,"iso-8859-1" },
+ {"en_JM" ,"british.hash" ,"iso-8859-1" },
+ {"en_NZ" ,"british.hash" ,"iso-8859-1" },
+ {"en_TT" ,"british.hash" ,"iso-8859-1" },
+ {"en_ZA" ,"british.hash" ,"iso-8859-1" },
+ {"en_ZW" ,"british.hash" ,"iso-8859-1" },
+ {"en_PH" ,"american.hash" ,"iso-8859-1" },
+ {"en_US" ,"american.hash" ,"iso-8859-1" },
+ {"eo" ,"esperanto.hash" ,"iso-8859-3" },
+ {"es" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_AR" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_BO" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_CL" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_CO" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_CR" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_DO" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_EC" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_ES" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_GT" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_HN" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_MX" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_NI" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_PA" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_PE" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_PR" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_PY" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_SV" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_UY" ,"espanol.hash" ,"iso-8859-1" },
+ {"es_VE" ,"espanol.hash" ,"iso-8859-1" },
+ {"fi" ,"finnish.hash" ,"iso-8859-1" },
+ {"fi_FI" ,"finnish.hash" ,"iso-8859-1" },
+ {"fr" ,"francais.hash" ,"iso-8859-1" },
+ {"fr_BE" ,"francais.hash" ,"iso-8859-1" },
+ {"fr_CA" ,"francais.hash" ,"iso-8859-1" },
+ {"fr_CH" ,"francais.hash" ,"iso-8859-1" },
+ {"fr_FR" ,"francais.hash" ,"iso-8859-1" },
+ {"fr_LU" ,"francais.hash" ,"iso-8859-1" },
+ {"fr_MC" ,"francais.hash" ,"iso-8859-1" },
+ {"hu" ,"hungarian.hash" ,"iso-8859-2" },
+ {"hu_HU" ,"hungarian.hash" ,"iso-8859-2" },
+ {"ga" ,"irish.hash" ,"iso-8859-1" },
+ {"ga_IE" ,"irish.hash" ,"iso-8859-1" },
+ {"gl" ,"galician.hash" ,"iso-8859-1" },
+ {"gl_ES" ,"galician.hash" ,"iso-8859-1" },
+ {"ia" ,"interlingua.hash" ,"iso-8859-1" },
+ {"it" ,"italian.hash" ,"iso-8859-1" },
+ {"it_IT" ,"italian.hash" ,"iso-8859-1" },
+ {"it_CH" ,"italian.hash" ,"iso-8859-1" },
+ {"la" ,"mlatin.hash" ,"iso-8859-1" },
+ {"la_IT" ,"mlatin.hash" ,"iso-8859-1" },
+ {"lt" ,"lietuviu.hash" ,"iso-8859-13" },
+ {"lt_LT" ,"lietuviu.hash" ,"iso-8859-13" },
+ {"nl" ,"nederlands.hash" ,"iso-8859-1" },
+ {"nl_NL" ,"nederlands.hash" ,"iso-8859-1" },
+ {"nl_BE" ,"nederlands.hash" ,"iso-8859-1" },
+ {"nb" ,"norsk.hash" ,"iso-8859-1" },
+ {"nb_NO" ,"norsk.hash" ,"iso-8859-1" },
+ {"nn" ,"nynorsk.hash" ,"iso-8859-1" },
+ {"nn_NO" ,"nynorsk.hash" ,"iso-8859-1" },
+ {"no" ,"norsk.hash" ,"iso-8859-1" },
+ {"no_NO" ,"norsk.hash" ,"iso-8859-1" },
+ {"pl" ,"polish.hash" ,"iso-8859-2" },
+ {"pl_PL" ,"polish.hash" ,"iso-8859-2" },
+ {"pt" ,"brazilian.hash" ,"iso-8859-1" },
+ {"pt_BR" ,"brazilian.hash" ,"iso-8859-1" },
+ {"pt_PT" ,"portugues.hash" ,"iso-8859-1" },
+ {"ru" ,"russian.hash" ,"koi8-r" },
+ {"ru_MD" ,"russian.hash" ,"koi8-r" },
+ {"ru_RU" ,"russian.hash" ,"koi8-r" },
+ {"sc" ,"sardinian.hash" ,"iso-8859-1" },
+ {"sc_IT" ,"sardinian.hash" ,"iso-8859-1" },
+ {"sk" ,"slovak.hash" ,"iso-8859-2" },
+ {"sk_SK" ,"slovak.hash" ,"iso-8859-2" },
+ {"sl" ,"slovensko.hash" ,"iso-8859-2" },
+ {"sl_SI" ,"slovensko.hash" ,"iso-8859-2" },
+ {"sv" ,"svenska.hash" ,"iso-8859-1" },
+ {"sv_SE" ,"svenska.hash" ,"iso-8859-1" },
+ {"uk" ,"ukrainian.hash" ,"koi8-u" },
+ {"uk_UA" ,"ukrainian.hash" ,"koi8-u" },
+ {"yi" ,"yiddish-yivo.hash" ,"utf-8" }
+};
+
+static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) );
+static TQMap<TQString, TQString> ispell_dict_map;
+
+
+void
+ISpellChecker::try_autodetect_charset(const char * const inEncoding)
+{
+ if (inEncoding && strlen(inEncoding))
+ {
+ m_translate_in = TQTextCodec::codecForName(inEncoding);
+ }
+}
+
+/***************************************************************************/
+/***************************************************************************/
+
+ISpellChecker::ISpellChecker()
+ : deftflag(-1),
+ prefstringchar(-1),
+ m_bSuccessfulInit(false),
+ m_BC(NULL),
+ m_cd(NULL),
+ m_cl(NULL),
+ m_cm(NULL),
+ m_ho(NULL),
+ m_nd(NULL),
+ m_so(NULL),
+ m_se(NULL),
+ m_ti(NULL),
+ m_te(NULL),
+ m_hashstrings(NULL),
+ m_hashtbl(NULL),
+ m_pflaglist(NULL),
+ m_sflaglist(NULL),
+ m_chartypes(NULL),
+ m_infile(NULL),
+ m_outfile(NULL),
+ m_askfilename(NULL),
+ m_Trynum(0),
+ m_translate_in(0)
+{
+ memset(m_sflagindex,0,sizeof(m_sflagindex));
+ memset(m_pflagindex,0,sizeof(m_pflagindex));
+}
+
+#ifndef FREEP
+#define FREEP(p) do { if (p) free(p); } while (0)
+#endif
+
+ISpellChecker::~ISpellChecker()
+{
+ if (m_bSuccessfulInit) {
+ // only cleanup our mess if we were successfully initialized
+
+ clearindex (m_pflagindex);
+ clearindex (m_sflagindex);
+ }
+
+ FREEP(m_hashtbl);
+ FREEP(m_hashstrings);
+ FREEP(m_sflaglist);
+ FREEP(m_chartypes);
+
+ delete m_translate_in;
+ m_translate_in = 0;
+}
+
+bool
+ISpellChecker::checkWord( const TQString& utf8Word )
+{
+ ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
+ if (!m_bSuccessfulInit)
+ return false;
+
+ if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty())
+ return false;
+
+ bool retVal = false;
+ TQCString out;
+ if (!m_translate_in)
+ return false;
+ else {
+ /* convert to 8bit string and null terminate */
+ int len_out = utf8Word.length();
+
+ out = m_translate_in->fromUnicode( utf8Word, len_out );
+ }
+
+ if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
+ {
+ if (good(iWord, 0, 0, 1, 0) == 1 ||
+ compoundgood(iWord, 1) == 1)
+ {
+ retVal = true;
+ }
+ }
+
+ return retVal;
+}
+
+TQStringList
+ISpellChecker::suggestWord(const TQString& utf8Word)
+{
+ ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
+ int c;
+
+ if (!m_bSuccessfulInit)
+ return TQStringList();
+
+ if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) ||
+ utf8Word.length() == 0)
+ return TQStringList();
+
+ TQCString out;
+ if (!m_translate_in)
+ return TQStringList();
+ else
+ {
+ /* convert to 8bit string and null terminate */
+
+ int len_out = utf8Word.length();
+ out = m_translate_in->fromUnicode( utf8Word, len_out );
+ }
+
+ if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
+ makepossibilities(iWord);
+ else
+ return TQStringList();
+
+ TQStringList sugg_arr;
+ for (c = 0; c < m_pcount; c++)
+ {
+ TQString utf8Word;
+
+ if (!m_translate_in)
+ {
+ /* copy to 8bit string and null terminate */
+ utf8Word = TQString::fromUtf8( m_possibilities[c] );
+ }
+ else
+ {
+ /* convert to 32bit string and null terminate */
+ utf8Word = m_translate_in->toUnicode( m_possibilities[c] );
+ }
+
+ sugg_arr.append( utf8Word );
+ }
+
+ return sugg_arr;
+}
+
+static void
+s_buildHashNames (std::vector<std::string> & names, const char * dict)
+{
+ const char * tmp = 0;
+ int i = 0;
+
+ names.clear ();
+
+ while ( (tmp = ispell_dirs[i++]) ) {
+ TQCString maybeFile = TQCString( tmp ) + '/';
+ maybeFile += dict;
+ names.push_back( maybeFile.data() );
+ }
+}
+
+static void
+s_allDics()
+{
+ const char * tmp = 0;
+ int i = 0;
+
+ while ( (tmp = ispell_dirs[i++]) ) {
+ TQDir dir( tmp );
+ TQStringList lst = dir.entryList( "*.hash" );
+ for ( TQStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) {
+ TQFileInfo info( *it );
+ for (size_t i = 0; i < size_ispell_map; i++)
+ {
+ const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
+ if (!strcmp (info.fileName().latin1(), mapping->dict))
+ {
+ ispell_dict_map.insert( mapping->lang, *it );
+ }
+ }
+ }
+ }
+}
+
+TQValueList<TQString>
+ISpellChecker::allDics()
+{
+ if ( ispell_dict_map.empty() )
+ s_allDics();
+
+ return ispell_dict_map.keys();
+}
+
+TQString
+ISpellChecker::loadDictionary (const char * szdict)
+{
+ std::vector<std::string> dict_names;
+
+ s_buildHashNames (dict_names, szdict);
+
+ for (size_t i = 0; i < dict_names.size(); i++)
+ {
+ if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0)
+ return dict_names[i].c_str();
+ }
+
+ return TQString::null;
+}
+
+/*!
+ * Load ispell dictionary hash file for given language.
+ *
+ * \param szLang - The language tag ("en-US") we want to use
+ * \return The name of the dictionary file
+ */
+bool
+ISpellChecker::loadDictionaryForLanguage ( const char * szLang )
+{
+ TQString hashname;
+
+ const char * encoding = NULL;
+ const char * szFile = NULL;
+
+ for (size_t i = 0; i < size_ispell_map; i++)
+ {
+ const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
+ if (!strcmp (szLang, mapping->lang))
+ {
+ szFile = mapping->dict;
+ encoding = mapping->enc;
+ break;
+ }
+ }
+
+ if (!szFile || !strlen(szFile))
+ return false;
+
+ alloc_ispell_struct();
+
+ hashname = loadDictionary(szFile);
+ if (hashname.isEmpty())
+ return false;
+
+ // one of the two above calls succeeded
+ setDictionaryEncoding (hashname, encoding);
+
+ return true;
+}
+
+void
+ISpellChecker::setDictionaryEncoding( const TQString& hashname, const char * encoding )
+{
+ /* Get Hash encoding from XML file. This should always work! */
+ try_autodetect_charset(encoding);
+
+ if (m_translate_in)
+ {
+ /* We still have to setup prefstringchar*/
+ prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag
+ : static_cast<int *>(NULL));
+
+ if (prefstringchar < 0)
+ {
+ std::string teststring;
+ for(int n1 = 1; n1 <= 15; n1++)
+ {
+ teststring = "latin" + n1;
+ prefstringchar = findfiletype(teststring.c_str(), 1,
+ deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
+ if (prefstringchar >= 0)
+ break;
+ }
+ }
+
+ return; /* success */
+ }
+
+ /* Test for UTF-8 first */
+ prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
+ if (prefstringchar >= 0)
+ {
+ m_translate_in = TQTextCodec::codecForName("utf8");
+ }
+
+ if (m_translate_in)
+ return; /* success */
+
+ /* Test for "latinN" */
+ if (!m_translate_in)
+ {
+ /* Look for "altstringtype" names from latin1 to latin15 */
+ for(int n1 = 1; n1 <= 15; n1++)
+ {
+ TQString teststring = TQString("latin%1").arg(n1);
+ prefstringchar = findfiletype(teststring.latin1(), 1,
+ deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
+ if (prefstringchar >= 0)
+ {
+ //FIXME: latin1 might be wrong
+ m_translate_in = TQTextCodec::codecForName( teststring.latin1() );
+ break;
+ }
+ }
+ }
+
+ /* If nothing found, use latin1 */
+ if (!m_translate_in)
+ {
+ m_translate_in = TQTextCodec::codecForName("latin1");
+ }
+}
+
+bool
+ISpellChecker::requestDictionary(const char *szLang)
+{
+ if (!loadDictionaryForLanguage (szLang))
+ {
+ // handle a shortened version of the language tag: en_US => en
+ std::string shortened_dict (szLang);
+ size_t uscore_pos;
+
+ if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) {
+ shortened_dict = shortened_dict.substr(0, uscore_pos);
+ if (!loadDictionaryForLanguage (shortened_dict.c_str()))
+ return false;
+ } else
+ return false;
+ }
+
+ m_bSuccessfulInit = true;
+
+ if (prefstringchar < 0)
+ m_defdupchar = 0;
+ else
+ m_defdupchar = prefstringchar;
+
+ return true;
+}