from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from collections import OrderedDict

from sklearn.metrics import accuracy_score
import tensorflow as tf

from .evaluator import Evaluator


class ProbesEvaluator(Evaluator):
    """Accumulate per-layer predictions and report one accuracy per layer.

    The evaluator tracks ``n_layers + 1`` prediction streams and exposes one
    metric per stream, named ``accuracy_layer_<i>`` for ``i`` in
    ``0 .. n_layers``.
    """

    def __init__(self, n_layers=12):
        """Create the evaluator.

        Args:
          n_layers: index of the last probed layer; ``n_layers + 1``
            accuracies are reported.
        """
        # declare metric names this evaluator will return
        metric_names = ['accuracy_layer_%d' % i for i in range(n_layers + 1)]
        # NOTE(review): these attributes are assigned *before* the base
        # constructor runs; presumably the base class touches them during
        # initialisation (e.g. by calling clear()) -- keep this ordering.
        self.n_layers = n_layers
        self.metric_names = metric_names
        # pass metric names to base class
        super(ProbesEvaluator, self).__init__(metric_names)

    def clear(self):
        """Reset internal storage: one empty prediction list per layer and
        an empty label list."""
        self.predictions = [[] for _ in range(self.n_layers + 1)]
        self.labels = []

    def add_batch_info(self, predictions, labels):
        """Store one batch of predictions and labels in the internal lists.

        Args:
          predictions: sequence with one entry per layer; entry ``i`` is an
            iterable of that layer's predictions for the batch.
          labels: iterable of the batch labels.
        """
        for i, layer_predictions in enumerate(predictions):
            self.predictions[i].extend(layer_predictions)
        self.labels.extend(labels)

    def evaluate(self, labels):
        """Compute the accuracy of every layer over all stored batches.

        Args:
          labels: unused here; kept for interface compatibility with callers.

        Returns:
          A dict mapping ``accuracy_layer_<i>`` to the accuracy of layer
          ``i``. When no predictions or no labels have been stored, every
          metric is reported as ``0.0``.
        """
        # self.predictions always holds n_layers + 1 sub-lists, so its own
        # length says nothing about whether any data was stored; inspect the
        # per-layer lists instead.
        has_predictions = any(
            len(layer_predictions) > 0 for layer_predictions in self.predictions)
        if not has_predictions or len(self.labels) == 0:
            tf.logging.info('empty data to evaluate')
            return {key: 0.0 for key in self.metric_names}

        ret_dict = OrderedDict()
        for i in range(self.n_layers + 1):
            accuracy = accuracy_score(self.labels, self.predictions[i])
            ret_dict['accuracy_layer_%d' % i] = accuracy
        return ret_dict


def teacher_probes_eval_metrics(logits, labels, num_labels):
    """Build the evaluation metrics used while evaluating.

    Args:
        logits: list of tensors, one per layer, each of shape
            [None, num_labels].
        labels (`Tensor`): shape of [None].
        num_labels: number of label ids; ids ``0 .. num_labels - 1`` are
            passed to the evaluator.

    Returns:
        metric_dict: the result of ``evaluator.get_metric_ops`` for the
        per-layer predictions and the labels.
    """
    # Per-layer predicted class id: arg-max over the last axis of the logits.
    predictions_list = [
        tf.argmax(layer_logits, axis=-1, output_type=tf.int32)
        for layer_logits in logits
    ]
    info_dict = {
        "predictions": predictions_list,
        "labels": labels,
    }
    evaluator = ProbesEvaluator(n_layers=len(logits) - 1)
    label_ids = list(range(num_labels))
    metric_dict = evaluator.get_metric_ops(info_dict, label_ids)

    # NOTE(review): evaluate() runs here, right after the metric ops are
    # created; the values written to the summaries are whatever the evaluator
    # holds at this point -- confirm this is the intended behaviour.
    ret_metrics = evaluator.evaluate(label_ids)
    for i in range(len(logits)):
        tf.summary.scalar(
            "eval_accuracy_layer_%d" % i, ret_metrics['accuracy_layer_%d' % i])
    return metric_dict