import logging
import os

import numpy as np
import torch

from fairseq.data import (
    data_utils,
    Dictionary,
    encoders,
    ConcatDataset,
    IdDataset,
    MaskTokensDataset,
    NestedDictionaryDataset,
    NumelDataset,
    NumSamplesDataset,
    PadDataset,
    PrependTokenDataset,
    RawLabelDataset,
    ResamplingDataset,
    SortDataset,
    TokenBlockDataset,
)
from fairseq.tasks import FairseqTask, register_task


logger = logging.getLogger(__name__)


@register_task('multilingual_masked_lm')
class MultiLingualMaskedLMTask(FairseqTask):
    """Task for training masked language models (e.g., BERT, RoBERTa).

    The data directory is expected to contain one sub-directory per language;
    each language is loaded, masked and tagged with a ``lang_id`` separately
    and the per-language datasets are then concatenated.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument('data', help='colon separated path to data directories list, \
                            will be iterated upon during epochs in round-robin manner')
        parser.add_argument('--sample-break-mode', default='complete',
                            choices=['none', 'complete', 'complete_doc', 'eos'],
                            help='If omitted or "none", fills each sample with tokens-per-sample '
                                 'tokens. If set to "complete", splits samples only at the end '
                                 'of sentence, but may include multiple sentences per sample. '
                                 '"complete_doc" is similar but respects doc boundaries. '
                                 'If set to "eos", includes only one sentence per sample.')
        parser.add_argument('--tokens-per-sample', default=512, type=int,
                            help='max number of total tokens over all segments '
                                 'per sample for BERT dataset')
        parser.add_argument('--mask-prob', default=0.15, type=float,
                            help='probability of replacing a token with mask')
        parser.add_argument('--leave-unmasked-prob', default=0.1, type=float,
                            help='probability that a masked token is unmasked')
        parser.add_argument('--random-token-prob', default=0.1, type=float,
                            help='probability of replacing a token with a random token')
        parser.add_argument('--freq-weighted-replacement', action='store_true',
                            help='sample random replacement words based on word frequencies')
        parser.add_argument('--mask-whole-words', default=False, action='store_true',
                            help='mask whole words; you may also want to set --bpe')
        parser.add_argument('--multilang-sampling-alpha', type=float, default=1.0,
                            help='smoothing alpha for sample rations across multiple datasets')

    def __init__(self, args, dictionary):
        """Store the dictionary and seed, and register the ``<mask>`` symbol.

        Args:
            args: parsed command-line arguments (must provide ``seed``).
            dictionary: vocabulary shared by source and target.
        """
        super().__init__(args)
        self.dictionary = dictionary
        self.seed = args.seed

        # add mask token
        self.mask_idx = dictionary.add_symbol('<mask>')

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Create the task, loading ``dict.txt`` from the first data path."""
        paths = args.data.split(os.pathsep)
        assert len(paths) > 0
        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))
        logger.info('dictionary: {} types'.format(len(dictionary)))
        return cls(args, dictionary)

    def _get_whole_word_mask(self):
        """Return a tensor flagging dictionary entries that begin a word.

        Returns:
            A ``torch.ByteTensor`` with one entry per dictionary symbol, or
            ``None`` when ``--mask-whole-words`` is off or no BPE encoder
            could be built from the arguments.
        """
        # Default to None so that the "flag set but no BPE available" case
        # returns cleanly instead of hitting an unbound local variable.
        mask_whole_words = None

        # create masked input and targets
        if self.args.mask_whole_words:
            bpe = encoders.build_bpe(self.args)
            if bpe is not None:

                def is_beginning_of_word(i):
                    if i < self.source_dictionary.nspecial:
                        # special elements are always considered beginnings
                        return True
                    tok = self.source_dictionary[i]
                    if tok.startswith('madeupword'):
                        return True
                    try:
                        return bpe.is_beginning_of_word(tok)
                    except ValueError:
                        return True

                mask_whole_words = torch.ByteTensor(list(
                    map(is_beginning_of_word, range(len(self.source_dictionary)))
                ))

        return mask_whole_words

    def _get_sample_prob(self, dataset_lens):
        """
        Get smoothed sampling probability by languages. This helps low resource
        languages by upsampling them (when the smoothing alpha is below 1).

        Args:
            dataset_lens: numpy array with the number of samples per language.

        Returns:
            Array of the same shape: the size proportions raised to
            ``--multilang-sampling-alpha`` and renormalized to sum to 1.
        """
        prob = dataset_lens / dataset_lens.sum()
        smoothed_prob = prob ** self.args.multilang_sampling_alpha
        smoothed_prob = smoothed_prob / smoothed_prob.sum()
        return smoothed_prob

    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            epoch (int): selects the data path (round-robin over the paths in
                ``args.data``) and seeds the resampling / shuffling
            combine (bool): forwarded to ``data_utils.load_indexed_dataset``

        Raises:
            FileNotFoundError: if a language directory has no data for *split*.
        """
        paths = self.args.data.split(os.pathsep)
        assert len(paths) > 0
        data_path = paths[epoch % len(paths)]

        # Every sub-directory of the data path is treated as one language.
        languages = sorted(
            name for name in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, name))
        )

        logger.info("Training on {0} languages: {1}".format(len(languages), languages))
        # The dict is passed as a lazy logging argument, so the message needs
        # a "%s" placeholder for it to actually appear in the log.
        logger.info(
            "Language to id mapping: %s",
            {lang: lang_id for lang_id, lang in enumerate(languages)},
        )

        mask_whole_words = self._get_whole_word_mask()
        lang_datasets = []
        for lang_id, language in enumerate(languages):
            split_path = os.path.join(data_path, language, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                self.source_dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            if dataset is None:
                raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path))

            # create continuous blocks of tokens
            dataset = TokenBlockDataset(
                dataset,
                dataset.sizes,
                self.args.tokens_per_sample - 1,  # one less for <s>
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode=self.args.sample_break_mode,
            )
            logger.info('loaded {} blocks from: {}'.format(len(dataset), split_path))

            # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
            dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())

            src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
                dataset,
                self.source_dictionary,
                pad_idx=self.source_dictionary.pad(),
                mask_idx=self.mask_idx,
                seed=self.args.seed,
                mask_prob=self.args.mask_prob,
                leave_unmasked_prob=self.args.leave_unmasked_prob,
                random_token_prob=self.args.random_token_prob,
                freq_weighted_replacement=self.args.freq_weighted_replacement,
                mask_whole_words=mask_whole_words,
            )

            lang_dataset = NestedDictionaryDataset(
                {
                    'net_input': {
                        'src_tokens': PadDataset(
                            src_dataset,
                            pad_idx=self.source_dictionary.pad(),
                            left_pad=False,
                        ),
                        'src_lengths': NumelDataset(src_dataset, reduce=False),
                    },
                    'target': PadDataset(
                        tgt_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                    ),
                    'nsentences': NumSamplesDataset(),
                    'ntokens': NumelDataset(src_dataset, reduce=True),
                    'lang_id': RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]),
                },
                sizes=[src_dataset.sizes],
            )
            lang_datasets.append(lang_dataset)

        dataset_lengths = np.array(
            [len(d) for d in lang_datasets],
            dtype=float,
        )
        logger.info(
            'loaded total {} blocks for all languages'.format(
                dataset_lengths.sum(),
            )
        )

        if split == self.args.train_subset:
            # For train subset, additionally up or down sample languages.
            sample_probs = self._get_sample_prob(dataset_lengths)
            logger.info(
                "Sample probability by language: %s",
                {
                    lang: "{0:.4f}".format(lang_prob)
                    for lang, lang_prob in zip(languages, sample_probs)
                },
            )
            # Ratio between each language's target size and its actual size.
            size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
            logger.info(
                "Up/Down Sampling ratio by language: %s",
                {
                    lang: "{0:.2f}".format(lang_ratio)
                    for lang, lang_ratio in zip(languages, size_ratio)
                },
            )

            resampled_lang_datasets = [
                ResamplingDataset(
                    lang_dataset,
                    size_ratio=lang_ratio,
                    seed=self.args.seed,
                    epoch=epoch,
                    # sample with replacement only when upsampling
                    replace=lang_ratio >= 1.0,
                )
                for lang_dataset, lang_ratio in zip(lang_datasets, size_ratio)
            ]
            dataset = ConcatDataset(resampled_lang_datasets)
        else:
            dataset = ConcatDataset(lang_datasets)
            lang_splits = [split]
            for lang_id, lang_dataset in enumerate(lang_datasets):
                split_name = split + '_' + languages[lang_id]
                lang_splits.append(split_name)
                self.datasets[split_name] = lang_dataset

            # [TODO]: This is hacky for now to print validation ppl for each
            # language individually. Maybe need task API changes to allow it
            # in more generic ways.
            if split in self.args.valid_subset:
                self.args.valid_subset = self.args.valid_subset.replace(
                    split, ','.join(lang_splits)
                )

        with data_utils.numpy_seed(self.args.seed + epoch):
            shuffle = np.random.permutation(len(dataset))

        self.datasets[split] = SortDataset(
            dataset,
            sort_order=[
                shuffle,
                dataset.sizes,
            ],
        )

    def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
        """Wrap raw source tokens into a dataset for inference.

        Args:
            src_tokens: source token sequences.
            src_lengths: length of each source sequence.
            sort (bool): when True, wrap the result in a ``SortDataset``
                keyed on ``src_lengths``.

        Returns:
            A dataset yielding ``id`` and ``net_input`` (``src_tokens``,
            ``src_lengths``) entries.
        """
        src_dataset = PadDataset(
            TokenBlockDataset(
                src_tokens,
                src_lengths,
                self.args.tokens_per_sample - 1,  # one less for <s>
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode='eos',
            ),
            pad_idx=self.source_dictionary.pad(),
            left_pad=False,
        )
        src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos())
        src_dataset = NestedDictionaryDataset(
            {
                'id': IdDataset(),
                'net_input': {
                    'src_tokens': src_dataset,
                    'src_lengths': NumelDataset(src_dataset, reduce=False),
                },
            },
            sizes=src_lengths,
        )
        if sort:
            src_dataset = SortDataset(src_dataset, sort_order=[src_lengths])
        return src_dataset

    def get_batch_iterator(
        self, dataset, max_tokens=None, max_sentences=None, max_positions=None,
        ignore_invalid_inputs=False, required_batch_size_multiple=1,
        seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=0,
    ):
        """Build a batch iterator via the parent task, without reusing old ones.

        All arguments are forwarded positionally to
        ``FairseqTask.get_batch_iterator``.
        """
        # Recreate epoch iterator every epoch cause the underlying
        # datasets are dynamic due to sampling.
        # NOTE(review): presumably the parent class caches iterators in
        # ``dataset_to_epoch_iter``; it is reset before and after the call.
        self.dataset_to_epoch_iter = {}
        epoch_iter = super().get_batch_iterator(
            dataset, max_tokens, max_sentences, max_positions,
            ignore_invalid_inputs, required_batch_size_multiple,
            seed, num_shards, shard_id, num_workers, epoch,
        )
        self.dataset_to_epoch_iter = {}
        return epoch_iter

    @property
    def source_dictionary(self):
        """Dictionary used for the source side (same object as the target)."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Dictionary used for the target side (same object as the source)."""
        return self.dictionary