Source code for konlp.tokenize.api
# Copyright (C) 2017 - 0000 KoNLTK project
#
# Korean Natural Language Toolkit:
#
#
# Author: HyunYoung Lee <hyun02.engineer@gmail.com>
# GyuHyeon Nam <ngh3053@gmail.com>
# Seungshik Kang <sskang@kookmin.ac.kr>
# URL: <https://www.konltk.org>
# For license information, see LICENSE.TXT
# ============================================================
"""Korean Natural Language Toolkit tonkenizer interface"""
from abc import ABCMeta, abstractmethod
from six import add_metaclass
[docs]@add_metaclass(ABCMeta)
class TokenizerI(object):
"""Tokenizer Interface"""
[docs] @abstractmethod
def tokenize(self, string):
"""Return a tokenized copy of string.
Args:
string (str): String to tokenize
Returns:
list(str): Tokenized tokens
Raises:
NotImplementedError: If not implement this method on a class that extends this class
"""
raise NotImplementedError()
[docs]class SimpleTokenizer(TokenizerI):
"""For an example about how to inherit the class above"""
[docs] def tokenize(self, string):
"""Simple string tokenizer by white-space character
Args:
string (str): String to tokenize
Returns:
str: Tokenized tokens
"""
return string.split()