// path: root/indexlib/tests/tokenizer-test.cpp
// blob: 1354ddcd6465cf70b6cfd5c864d3715a31c462e3
#include <boost/test/unit_test.hpp>
#include "tokenizer.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <memory>
#include <string>
#include <vector>

using namespace ::boost::unit_test;
namespace indexlib { namespace tests { namespace tokenizer_test {

using indexlib::detail::tokenizer;
using indexlib::detail::get_tokenizer;

void simple() {
	std::unique_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
	assert(tokenizer);
	std::vector<std::string> tokens = tokenizer->string_to_words( "one     ,as, ''#`:ThReE,  �����" );
	std::vector<std::string> expected;
	expected.push_back( "ONE" );
	expected.push_back( "AS" );
	expected.push_back( "THREE" );
	expected.push_back( "AAACE" );
	std::sort( tokens.begin(), tokens.end() );
	std::sort( expected.begin(), expected.end() );
	BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
	for ( int i = 0; i < expected.size() && i < tokens.size(); ++i ) {
		BOOST_CHECK_EQUAL( expected[ i ], tokens[ i ] );
	}
}

void with_newlines() {
	std::unique_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
	assert(tokenizer);
	std::vector<std::string> tokens = tokenizer->string_to_words( "one\ntwo\nthree" );
	std::vector<std::string> expected;
	expected.push_back( "ONE" );
	expected.push_back( "TWO" );
	expected.push_back( "THREE" );
	std::sort( tokens.begin(), tokens.end() );
	std::sort( expected.begin(), expected.end() );
	BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
	for ( int i = 0; i < expected.size() && i < tokens.size(); ++i ) {
		BOOST_CHECK_EQUAL( expected.at( i ), tokens.at( i ) );
	}
}

void with_numbers() {
	std::unique_ptr<tokenizer> tokenizer = get_tokenizer( "latin-1:european" );
	assert(tokenizer);
	std::vector<std::string> tokens = tokenizer->string_to_words( "one 012 123 four" );
	std::vector<std::string> expected;
	expected.push_back( "ONE" );
	expected.push_back( "012" );
	expected.push_back( "123" );
	expected.push_back( "FOUR" );
	std::sort( tokens.begin(), tokens.end() );
	std::sort( expected.begin(), expected.end() );
	BOOST_CHECK_EQUAL( expected.size(), tokens.size() );
	for ( int i = 0; i < expected.size() && i < tokens.size(); ++i ) {
		BOOST_CHECK_EQUAL( expected.at( i ), tokens.at( i ) );
	}
}

/**
 * Creates the "Tokenizer tests" suite and registers the test cases defined
 * in this file on it, in the order simple, with_newlines, with_numbers.
 *
 * @return the newly created suite.
 */
test_suite* get_suite() {
	test_suite* const suite = BOOST_TEST_SUITE( "Tokenizer tests" );

	suite->add( BOOST_TEST_CASE( &simple ) );
	suite->add( BOOST_TEST_CASE( &with_newlines ) );
	suite->add( BOOST_TEST_CASE( &with_numbers ) );

	return suite;
}

}}} //namespaces