resources¶
sighan2005¶
The Second International Chinese Word Segmentation Bakeoff took place over the summer of 2005.
pku¶
-
hanlp.datasets.cws.sighan2005.pku.
SIGHAN2005_PKU_DEV
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/pku_training_10.txt'¶ Dev set (last 10% of the full official training set).
-
hanlp.datasets.cws.sighan2005.pku.
SIGHAN2005_PKU_DICT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/pku_training_words.utf8'¶ Dictionary built on the training set.
-
hanlp.datasets.cws.sighan2005.pku.
SIGHAN2005_PKU_TEST
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/pku_test_gold.utf8'¶ Test set.
-
hanlp.datasets.cws.sighan2005.pku.
SIGHAN2005_PKU_TEST_INPUT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/pku_test.utf8'¶ Test input.
-
hanlp.datasets.cws.sighan2005.pku.
SIGHAN2005_PKU_TRAIN
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/pku_training_90.txt'¶ Training set (first 90% of the full official training set).
-
hanlp.datasets.cws.sighan2005.pku.
SIGHAN2005_PKU_TRAIN_ALL
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/pku_training.utf8'¶ Full training set.
msr¶
-
hanlp.datasets.cws.sighan2005.msr.
SIGHAN2005_MSR_DEV
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/msr_training_10.txt'¶ Dev set (last 10% of the full official training set).
-
hanlp.datasets.cws.sighan2005.msr.
SIGHAN2005_MSR_DICT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/msr_training_words.utf8'¶ Dictionary built on the training set.
-
hanlp.datasets.cws.sighan2005.msr.
SIGHAN2005_MSR_TEST
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/msr_test_gold.utf8'¶ Test set.
-
hanlp.datasets.cws.sighan2005.msr.
SIGHAN2005_MSR_TEST_INPUT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/msr_test.utf8'¶ Test input.
-
hanlp.datasets.cws.sighan2005.msr.
SIGHAN2005_MSR_TRAIN
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/msr_training_90.txt'¶ Training set (first 90% of the full official training set).
-
hanlp.datasets.cws.sighan2005.msr.
SIGHAN2005_MSR_TRAIN_ALL
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/msr_training.utf8'¶ Full training set.
as¶
-
hanlp.datasets.cws.sighan2005.as_.
SIGHAN2005_AS_DEV
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/as_training_10.txt'¶ Dev set (last 10% of the full official training set).
-
hanlp.datasets.cws.sighan2005.as_.
SIGHAN2005_AS_DICT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/as_training_words.utf8'¶ Dictionary built on the training set.
-
hanlp.datasets.cws.sighan2005.as_.
SIGHAN2005_AS_TEST
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/as_testing_gold.utf8'¶ Test set.
-
hanlp.datasets.cws.sighan2005.as_.
SIGHAN2005_AS_TEST_INPUT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/as_testing.utf8'¶ Test input.
-
hanlp.datasets.cws.sighan2005.as_.
SIGHAN2005_AS_TRAIN
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/as_training_90.txt'¶ Training set (first 90% of the full official training set).
-
hanlp.datasets.cws.sighan2005.as_.
SIGHAN2005_AS_TRAIN_ALL
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/as_training.utf8'¶ Full training set.
cityu¶
-
hanlp.datasets.cws.sighan2005.cityu.
SIGHAN2005_CITYU_DEV
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/cityu_training_10.txt'¶ Dev set (last 10% of the full official training set).
-
hanlp.datasets.cws.sighan2005.cityu.
SIGHAN2005_CITYU_DICT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/cityu_training_words.utf8'¶ Dictionary built on the training set.
-
hanlp.datasets.cws.sighan2005.cityu.
SIGHAN2005_CITYU_TEST
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#gold/cityu_test_gold.utf8'¶ Test set.
-
hanlp.datasets.cws.sighan2005.cityu.
SIGHAN2005_CITYU_TEST_INPUT
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#testing/cityu_test.utf8'¶ Test input.
-
hanlp.datasets.cws.sighan2005.cityu.
SIGHAN2005_CITYU_TRAIN
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/cityu_training_90.txt'¶ Training set (first 90% of the full official training set).
-
hanlp.datasets.cws.sighan2005.cityu.
SIGHAN2005_CITYU_TRAIN_ALL
= 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip#training/cityu_training.utf8'¶ Full training set.
CTB6¶
-
hanlp.datasets.cws.ctb6.
CTB6_CWS_DEV
= 'http://file.hankcs.com/corpus/ctb6_cws.zip#dev.txt'¶ CTB6 dev set.
-
hanlp.datasets.cws.ctb6.
CTB6_CWS_TEST
= 'http://file.hankcs.com/corpus/ctb6_cws.zip#test.txt'¶ CTB6 test set.
-
hanlp.datasets.cws.ctb6.
CTB6_CWS_TRAIN
= 'http://file.hankcs.com/corpus/ctb6_cws.zip#train.txt'¶ CTB6 training set.
CTB8¶
-
hanlp.datasets.parsing.ctb8.
CTB8_CWS_TRAIN
= 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/tasks/cws/train.txt'¶ Training set for ctb8 Chinese word segmentation.
-
hanlp.datasets.parsing.ctb8.
CTB8_CWS_DEV
= 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/tasks/cws/dev.txt'¶ Dev set for ctb8 Chinese word segmentation.
-
hanlp.datasets.parsing.ctb8.
CTB8_CWS_TEST
= 'https://wakespace.lib.wfu.edu/bitstream/handle/10339/39379/LDC2013T21.tgz#data/tasks/cws/test.txt'¶ Test set for ctb8 Chinese word segmentation.
CTB9¶
-
hanlp.datasets.parsing.ctb9.
CTB9_CWS_TRAIN
= 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/tasks/cws/train.txt'¶ Training set for ctb9 Chinese word segmentation.
-
hanlp.datasets.parsing.ctb9.
CTB9_CWS_DEV
= 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/tasks/cws/dev.txt'¶ Dev set for ctb9 Chinese word segmentation.
-
hanlp.datasets.parsing.ctb9.
CTB9_CWS_TEST
= 'https://catalog.ldc.upenn.edu/LDC2016T13/ctb9.0_LDC2016T13.tgz#data/tasks/cws/test.txt'¶ Test set for ctb9 Chinese word segmentation.