resources

sighan2005

The Second International Chinese Word Segmentation Bakeoff took place over the summer of 2005.

pku

hanlp.datasets.tokenization.sighan2005.pku.SIGHAN2005_PKU_DEV = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/pku_training_10.txt'

Dev set (last 10% of the full official training set).

hanlp.datasets.tokenization.sighan2005.pku.SIGHAN2005_PKU_DICT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/pku_training_words.utf8'

Dictionary built on the training set.

hanlp.datasets.tokenization.sighan2005.pku.SIGHAN2005_PKU_TEST = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/pku_test_gold.utf8'

Test set.

hanlp.datasets.tokenization.sighan2005.pku.SIGHAN2005_PKU_TEST_INPUT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/pku_test.utf8'

Test input.

hanlp.datasets.tokenization.sighan2005.pku.SIGHAN2005_PKU_TRAIN = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/pku_training_90.txt'

Training set (first 90% of the full official training set).

hanlp.datasets.tokenization.sighan2005.pku.SIGHAN2005_PKU_TRAIN_ALL = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/pku_training.utf8'

Full training set.

msr

hanlp.datasets.tokenization.sighan2005.msr.SIGHAN2005_MSR_DEV = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/msr_training_10.txt'

Dev set (last 10% of the full official training set).

hanlp.datasets.tokenization.sighan2005.msr.SIGHAN2005_MSR_DICT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/msr_training_words.utf8'

Dictionary built on the training set.

hanlp.datasets.tokenization.sighan2005.msr.SIGHAN2005_MSR_TEST = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/msr_test_gold.utf8'

Test set.

hanlp.datasets.tokenization.sighan2005.msr.SIGHAN2005_MSR_TEST_INPUT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/msr_test.utf8'

Test input.

hanlp.datasets.tokenization.sighan2005.msr.SIGHAN2005_MSR_TRAIN = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/msr_training_90.txt'

Training set (first 90% of the full official training set).

hanlp.datasets.tokenization.sighan2005.msr.SIGHAN2005_MSR_TRAIN_ALL = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/msr_training.utf8'

Full training set.

as

hanlp.datasets.tokenization.sighan2005.as_.SIGHAN2005_AS_DEV = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/as_training_10.txt'

Dev set (last 10% of the full official training set).

hanlp.datasets.tokenization.sighan2005.as_.SIGHAN2005_AS_DICT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/as_training_words.utf8'

Dictionary built on the training set.

hanlp.datasets.tokenization.sighan2005.as_.SIGHAN2005_AS_TEST = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/as_testing_gold.utf8'

Test set.

hanlp.datasets.tokenization.sighan2005.as_.SIGHAN2005_AS_TEST_INPUT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/as_testing.utf8'

Test input.

hanlp.datasets.tokenization.sighan2005.as_.SIGHAN2005_AS_TRAIN = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/as_training_90.txt'

Training set (first 90% of the full official training set).

hanlp.datasets.tokenization.sighan2005.as_.SIGHAN2005_AS_TRAIN_ALL = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/as_training.utf8'

Full training set.

cityu

hanlp.datasets.tokenization.sighan2005.cityu.SIGHAN2005_CITYU_DEV = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/cityu_training_10.txt'

Dev set (last 10% of the full official training set).

hanlp.datasets.tokenization.sighan2005.cityu.SIGHAN2005_CITYU_DICT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/cityu_training_words.utf8'

Dictionary built on the training set.

hanlp.datasets.tokenization.sighan2005.cityu.SIGHAN2005_CITYU_TEST = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/cityu_test_gold.utf8'

Test set.

hanlp.datasets.tokenization.sighan2005.cityu.SIGHAN2005_CITYU_TEST_INPUT = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/cityu_test.utf8'

Test input.

hanlp.datasets.tokenization.sighan2005.cityu.SIGHAN2005_CITYU_TRAIN = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/cityu_training_90.txt'

Training set (first 90% of the full official training set).

hanlp.datasets.tokenization.sighan2005.cityu.SIGHAN2005_CITYU_TRAIN_ALL = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/cityu_training.utf8'

Full training set.

CTB6

hanlp.datasets.tokenization.ctb6.CTB6_CWS_DEV = 'http://file.hankcs.com/corpus/ctb6_cws.zip#dev.txt'

CTB6 dev set.

hanlp.datasets.tokenization.ctb6.CTB6_CWS_TEST = 'http://file.hankcs.com/corpus/ctb6_cws.zip#test.txt'

CTB6 test set.

hanlp.datasets.tokenization.ctb6.CTB6_CWS_TRAIN = 'http://file.hankcs.com/corpus/ctb6_cws.zip#train.txt'

CTB6 training set.

CTB8

hanlp.datasets.parsing.ctb8.CTB8_CWS_TRAIN = 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/tasks/cws/train.txt'

Training set for ctb8 Chinese word segmentation.

hanlp.datasets.parsing.ctb8.CTB8_CWS_DEV = 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/tasks/cws/dev.txt'

Dev set for ctb8 Chinese word segmentation.

hanlp.datasets.parsing.ctb8.CTB8_CWS_TEST = 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/tasks/cws/test.txt'

Test set for ctb8 Chinese word segmentation.

CTB9

hanlp.datasets.parsing.ctb9.CTB9_CWS_TRAIN = 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/tasks/cws/train.txt'

Training set for ctb9 Chinese word segmentation.

hanlp.datasets.parsing.ctb9.CTB9_CWS_DEV = 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/tasks/cws/dev.txt'

Dev set for ctb9 Chinese word segmentation.

hanlp.datasets.parsing.ctb9.CTB9_CWS_TEST = 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/tasks/cws/test.txt'

Test set for ctb9 Chinese word segmentation.