ENH _fit_and_score now returns a dictionary #17332

Merged (7 commits) on Jun 26, 2020

40 changes: 13 additions & 27 deletions sklearn/model_selection/_search.py
@@ -368,13 +368,12 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
# NOTE we are not using the return value as the scorer by itself should be
# validated before. We use check_scoring only to reject multimetric scorer
check_scoring(estimator, scorer)
scores, n_samples_test = _fit_and_score(estimator, X, y,
scorer, train,
test, verbose, parameters,
fit_params=fit_params,
return_n_test_samples=True,
error_score=error_score)
return scores, parameters, n_samples_test
results = _fit_and_score(estimator, X, y, scorer, train,
test, verbose, parameters,
fit_params=fit_params,
return_n_test_samples=True,
error_score=error_score)
return results["test_scores"], parameters, results["n_test_samples"]
Member:
Would it make sense to get the parameters from the dict?

Suggested change
return results["test_scores"], parameters, results["n_test_samples"]
return results["test_scores"], results["parameters"], results["n_test_samples"]

Member Author:
In this case, I would say no: return_parameters is set to False, so "parameters" is not in the dict, and fit_grid_point is deprecated.

Member:
OK, makes sense.
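
For context, a minimal sketch (not library code) of which keys end up in the dictionary that _fit_and_score now returns, depending on the return_* flags; fit_grid_point above only requests return_n_test_samples=True, which is why "parameters" is absent there:

# Illustrative only: mirrors the flag handling shown in the _fit_and_score diff further down.
def result_keys(return_train_score=False, return_n_test_samples=False,
                return_times=False, return_parameters=False,
                return_estimator=False):
    keys = ["test_scores"]          # always present
    if return_train_score:
        keys.append("train_scores")
    if return_n_test_samples:
        keys.append("n_test_samples")
    if return_times:
        keys += ["fit_time", "score_time"]
    if return_parameters:
        keys.append("parameters")
    if return_estimator:
        keys.append("estimator")
    return keys

print(result_keys(return_n_test_samples=True))
# ['test_scores', 'n_test_samples']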



def _check_param_grid(param_grid):
@@ -805,20 +804,7 @@ def evaluate_candidates(candidate_params):

def _format_results(self, candidate_params, scorers, n_splits, out):
n_candidates = len(candidate_params)

# if one choose to see train score, "out" will contain train score info
if self.return_train_score:
(train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
score_time) = zip(*out)
else:
(test_score_dicts, test_sample_counts, fit_time,
score_time) = zip(*out)

# test_score_dicts and train_score dicts are lists of dictionaries and
# we make them into dict of lists
test_scores = _aggregate_score_dicts(test_score_dicts)
if self.return_train_score:
train_scores = _aggregate_score_dicts(train_score_dicts)
out = _aggregate_score_dicts(out)

results = {}

Expand Down Expand Up @@ -846,8 +832,8 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
results["rank_%s" % key_name] = np.asarray(
rankdata(-array_means, method='min'), dtype=np.int32)

_store('fit_time', fit_time)
_store('score_time', score_time)
_store('fit_time', out["fit_time"])
_store('score_time', out["score_time"])
# Use one MaskedArray and mask all the places where the param is not
# applicable for that candidate. Use defaultdict as each candidate may
# not contain all the params
@@ -866,11 +852,11 @@
# Store a list of param dicts at the key 'params'
results['params'] = candidate_params

# NOTE test_sample counts (weights) remain the same for all candidates
test_sample_counts = np.array(test_sample_counts[:n_splits],
dtype=int)
test_scores = _aggregate_score_dicts(out["test_scores"])
if self.return_train_score:
train_scores = _aggregate_score_dicts(out["train_scores"])

for scorer_name in scorers.keys():
for scorer_name in test_scores:
# Computed the (weighted) mean and std for test scores alone
_store('test_%s' % scorer_name, test_scores[scorer_name],
splits=True, rank=True,
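
Both _format_results above and cross_validate below lean on _aggregate_score_dicts to flip the list of per-split result dicts returned by the parallel loop into a single dict of per-key arrays. A rough sketch of the idea, assuming numeric values only (the real helper also has to cope with non-numeric entries such as fitted estimators or per-split score dicts):

import numpy as np

def aggregate_score_dicts(scores):
    # One dict per CV split in, one array per key out.
    return {key: np.asarray([score[key] for score in scores])
            for key in scores[0]}

per_split = [{"fit_time": 0.10, "score_time": 0.01},
             {"fit_time": 0.12, "score_time": 0.02},
             {"fit_time": 0.11, "score_time": 0.01}]
print(aggregate_score_dicts(per_split))
# {'fit_time': array([0.1 , 0.12, 0.11]), 'score_time': array([0.01, 0.02, 0.01])}
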
108 changes: 54 additions & 54 deletions sklearn/model_selection/_validation.py
@@ -239,35 +239,34 @@ def cross_validate(estimator, X, y=None, *, groups=None, scoring=None, cv=None,
# independent, and that it is pickle-able.
parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
pre_dispatch=pre_dispatch)
scores = parallel(
results = parallel(
delayed(_fit_and_score)(
clone(estimator), X, y, scorers, train, test, verbose, None,
fit_params, return_train_score=return_train_score,
return_times=True, return_estimator=return_estimator,
error_score=error_score)
for train, test in cv.split(X, y, groups))

zipped_scores = list(zip(*scores))
if return_train_score:
train_scores = zipped_scores.pop(0)
train_scores = _aggregate_score_dicts(train_scores)
results = _aggregate_score_dicts(results)
if return_estimator:
fitted_estimators = zipped_scores.pop()
test_scores, fit_times, score_times = zipped_scores
test_scores = _aggregate_score_dicts(test_scores)
fitted_estimators = results["estimator"]

ret = {}
ret['fit_time'] = np.array(fit_times)
ret['score_time'] = np.array(score_times)
ret['fit_time'] = results["fit_time"]
ret['score_time'] = results["score_time"]

if return_estimator:
ret['estimator'] = fitted_estimators

for name in scorers:
ret['test_%s' % name] = np.array(test_scores[name])
test_scores = _aggregate_score_dicts(results["test_scores"])
if return_train_score:
train_scores = _aggregate_score_dicts(results["train_scores"])

for name in test_scores:
ret['test_%s' % name] = test_scores[name]
if return_train_score:
key = 'train_%s' % name
ret[key] = np.array(train_scores[name])
ret[key] = train_scores[name]

return ret
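
The user-facing contract of cross_validate is unchanged by this refactor; only the internal bookkeeping moves from positional unpacking to dict lookups. A short usage sketch of the returned dict and its keys:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X, y = load_iris(return_X_y=True)
cv_results = cross_validate(LogisticRegression(max_iter=1000), X, y, cv=5,
                            scoring=("accuracy", "f1_macro"),
                            return_train_score=True)
print(sorted(cv_results))
# ['fit_time', 'score_time', 'test_accuracy', 'test_f1_macro',
#  'train_accuracy', 'train_f1_macro']
print(cv_results["test_accuracy"].shape)   # (5,): one entry per CV split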

@@ -484,27 +483,22 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,

Returns
-------
train_scores : dict of scorer name -> float
Score on training set (for all the scorers),
returned only if `return_train_score` is `True`.

test_scores : dict of scorer name -> float
Score on testing set (for all the scorers).

n_test_samples : int
Number of test samples.

fit_time : float
Time spent for fitting in seconds.

score_time : float
Time spent for scoring in seconds.

parameters : dict or None
The parameters that have been evaluated.

estimator : estimator object
The fitted estimator
result : dict with the following attributes
train_scores : dict of scorer name -> float
Score on training set (for all the scorers),
returned only if `return_train_score` is `True`.
test_scores : dict of scorer name -> float
Score on testing set (for all the scorers).
n_test_samples : int
Number of test samples.
fit_time : float
Time spent for fitting in seconds.
score_time : float
Time spent for scoring in seconds.
parameters : dict or None
The parameters that have been evaluated.
estimator : estimator object
The fitted estimator.
"""
progress_msg = ""
if verbose > 2:
@@ -529,7 +523,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
fit_params = fit_params if fit_params is not None else {}
fit_params = _check_fit_params(X, fit_params, train)

train_scores = {}
if parameters is not None:
# clone after setting parameters in case any parameters
# are estimators (like pipeline steps)
@@ -545,6 +538,7 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
X_train, y_train = _safe_split(estimator, X, y, train)
X_test, y_test = _safe_split(estimator, X, y, test, train)

result = {}
try:
if y_train is None:
estimator.fit(X_train, **fit_params)
@@ -575,7 +569,6 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
raise ValueError("error_score must be the string 'raise' or a"
" numeric value. (Hint: if using 'raise', please"
" make sure that it has been spelled correctly.)")

else:
fit_time = time.time() - start_time
test_scores = _score(estimator, X_test, y_test, scorer)
@@ -602,17 +595,19 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
end_msg += result_msg
print(end_msg)

ret = [train_scores, test_scores] if return_train_score else [test_scores]

result["test_scores"] = test_scores
if return_train_score:
result["train_scores"] = train_scores
if return_n_test_samples:
ret.append(_num_samples(X_test))
result["n_test_samples"] = _num_samples(X_test)
if return_times:
ret.extend([fit_time, score_time])
result["fit_time"] = fit_time
result["score_time"] = score_time
if return_parameters:
ret.append(parameters)
result["parameters"] = parameters
if return_estimator:
ret.append(estimator)
return ret
result["estimator"] = estimator
return result
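
Stepping back from the diff: downstream callers now read optional fields by name instead of tracking tuple positions that shift with the return_* flags. A tiny sketch with made-up values (key names as in the Returns section above):

result = {"test_scores": {"accuracy": 0.93}, "fit_time": 0.12,
          "score_time": 0.01, "n_test_samples": 30}

print(result["test_scores"]["accuracy"])   # 0.93
print(result.get("parameters"))            # None: only present when
                                           # return_parameters=True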


def _score(estimator, X_test, y_test, scorer):
@@ -1296,23 +1291,27 @@ def learning_curve(estimator, X, y, *, groups=None,
out = parallel(delayed(_incremental_fit_estimator)(
clone(estimator), X, y, classes, train, test, train_sizes_abs,
scorer, verbose, return_times) for train, test in cv_iter)
out = np.asarray(out).transpose((2, 1, 0))
else:
train_test_proportions = []
for train, test in cv_iter:
for n_train_samples in train_sizes_abs:
train_test_proportions.append((train[:n_train_samples], test))

out = parallel(delayed(_fit_and_score)(
results = parallel(delayed(_fit_and_score)(
clone(estimator), X, y, scorer, train, test, verbose,
parameters=None, fit_params=None, return_train_score=True,
error_score=error_score, return_times=return_times)
for train, test in train_test_proportions)
out = np.array(out)
n_cv_folds = out.shape[0] // n_unique_ticks
dim = 4 if return_times else 2
out = out.reshape(n_cv_folds, n_unique_ticks, dim)
results = _aggregate_score_dicts(results)
train_scores = results["train_scores"].reshape(-1, n_unique_ticks).T
test_scores = results["test_scores"].reshape(-1, n_unique_ticks).T
out = [train_scores, test_scores]

out = np.asarray(out).transpose((2, 1, 0))
if return_times:
fit_times = results["fit_time"].reshape(-1, n_unique_ticks).T
score_times = results["score_time"].reshape(-1, n_unique_ticks).T
out.extend([fit_times, score_times])

ret = train_sizes_abs, out[0], out[1]
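
The reshape bookkeeping above relies on the parallel results coming back fold-major: for each CV split, every train-size tick in order, exactly as train_test_proportions was built. A small numeric sketch with made-up scores (validation_curve below applies the same pattern with n_params in place of n_unique_ticks):

import numpy as np

n_ticks = 2   # two train-size ticks over three CV folds
flat_test_scores = np.array([0.70, 0.80,    # fold 0: tick 0, tick 1
                             0.65, 0.85,    # fold 1
                             0.60, 0.90])   # fold 2

test_scores = flat_test_scores.reshape(-1, n_ticks).T
print(test_scores.shape)   # (2, 3): one row per tick, one column per fold
print(test_scores[0])      # [0.7  0.65 0.6 ]  test scores at the first tick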

@@ -1522,18 +1521,19 @@ def validation_curve(estimator, X, y, *, param_name, param_range, groups=None,

parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch,
verbose=verbose)
out = parallel(delayed(_fit_and_score)(
results = parallel(delayed(_fit_and_score)(
clone(estimator), X, y, scorer, train, test, verbose,
parameters={param_name: v}, fit_params=None, return_train_score=True,
error_score=error_score)
# NOTE do not change order of iteration to allow one time cv splitters
for train, test in cv.split(X, y, groups) for v in param_range)
out = np.asarray(out)
n_params = len(param_range)
n_cv_folds = out.shape[0] // n_params
out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0))

return out[0], out[1]
results = _aggregate_score_dicts(results)
train_scores = results["train_scores"].reshape(-1, n_params).T
test_scores = results["test_scores"].reshape(-1, n_params).T

return train_scores, test_scores


def _aggregate_score_dicts(scores):
2 changes: 1 addition & 1 deletion sklearn/model_selection/tests/test_validation.py
@@ -1700,7 +1700,7 @@ def test_fit_and_score_working():
'return_parameters': True}
result = _fit_and_score(*fit_and_score_args,
**fit_and_score_kwargs)
assert result[-1] == fit_and_score_kwargs['parameters']
assert result['parameters'] == fit_and_score_kwargs['parameters']


def three_params_scorer(i, j, k):