I am doing some data science problems for practice, and this is the question I'm currently tackling:
Given a list L of values generated independently by some unknown process, we will use the mean of L to predict unseen values generated by the same process. Use leave-one-out cross-validation to estimate the mean absolute error (MAE) of this process.
- Input: An array of floats arr
- Output: A float score
Example:
- arr = [1,2,3],
- score = 1.0
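For reference, this is how I understand the intended computation: for each element, the "model" is fitted on the remaining values (i.e. their mean), that mean is used to predict the held-out element, and the absolute errors are averaged. A minimal plain-NumPy sketch of that reading (the helper name loocv_mae_of_mean is just mine, for illustration) reproduces the example score:
import numpy as np

def loocv_mae_of_mean(arr):
    # hypothetical helper to check my reading of the problem:
    # the "model" for each fold is simply the mean of the training values
    arr = np.asarray(arr, dtype=float)
    errors = []
    for i in range(len(arr)):
        train = np.delete(arr, i)      # every value except the i-th
        prediction = train.mean()      # the fitted "model" is just the mean
        errors.append(abs(arr[i] - prediction))
    return float(np.mean(errors))

print(loocv_mae_of_mean([1, 2, 3]))  # 1.0, matching the example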
Now, usually, the input variables (X) and the target variable (y) have the same number of rows. But in this case, since it says "we will use the mean of L to predict unseen values", what does y look like? In the given example, X has just one column, so if we take the mean of X we get a scalar value, which gives an error when I try to do cross-validation:
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np
# input list of values
x = [[2, 5, 4, 3, 4, 6, 7, 5, 8, 9]]
# define the output as the mean of the inputs, as specified in the question
y = [np.mean(x)]
# build multiple linear regression model
model = LinearRegression()
# define cross-validation method to use
cv = LeaveOneOut()
# use LOOCV to evaluate model
scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# view mean absolute error
np.mean(np.absolute(scores))
>>>
Empty Traceback (most recent call last)
File ~/miniforge3/lib/python3.10/site-packages/joblib/parallel.py:862, in Parallel.dispatch_one_batch(self, iterator)
861 try:
--> 862 tasks = self._ready_batches.get(block=False)
863 except queue.Empty:
864 # slice the iterator n_jobs * batchsize items at a time. If the
865 # slice returns less than that, then the current batchsize puts
(...)
868 # accordingly to distribute evenly the last items between all
869 # workers.
File ~/miniforge3/lib/python3.10/queue.py:168, in Queue.get(self, block, timeout)
167 if not self._qsize():
--> 168 raise Empty
169 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Input In [70], in <cell line: 18>()
15 cv = LeaveOneOut()
17 # use LOOCV to evaluate model
---> 18 scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
20 # view mean absolute error
21 np.mean(np.absolute(scores))
File ~/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:515, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
512 # To ensure multimetric format is not supported
513 scorer = check_scoring(estimator, scoring=scoring)
--> 515 cv_results = cross_validate(
516 estimator=estimator,
517 X=X,
518 y=y,
519 groups=groups,
520 scoring={"score": scorer},
521 cv=cv,
522 n_jobs=n_jobs,
523 verbose=verbose,
524 fit_params=fit_params,
525 pre_dispatch=pre_dispatch,
526 error_score=error_score,
527 )
528 return cv_results["test_score"]
File ~/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File ~/miniforge3/lib/python3.10/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
1076 try:
1077 # Only set self._iterating to True if at least a batch
1078 # was dispatched. In particular this covers the edge
(...)
1082 # was very quick and its callback already dispatched all the
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1088 while self.dispatch_one_batch(iterator):
File ~/miniforge3/lib/python3.10/site-packages/joblib/parallel.py:873, in Parallel.dispatch_one_batch(self, iterator)
870 n_jobs = self._cached_effective_n_jobs
871 big_batch_size = batch_size * n_jobs
--> 873 islice = list(itertools.islice(iterator, big_batch_size))
874 if len(islice) == 0:
875 return False
File ~/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py:266, in <genexpr>(.0)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File ~/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_split.py:86, in BaseCrossValidator.split(self, X, y, groups)
84 X, y, groups = indexable(X, y, groups)
85 indices = np.arange(_num_samples(X))
---> 86 for test_index in self._iter_test_masks(X, y, groups):
87 train_index = indices[np.logical_not(test_index)]
88 test_index = indices[test_index]
File ~/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_split.py:98, in BaseCrossValidator._iter_test_masks(self, X, y, groups)
93 def _iter_test_masks(self, X=None, y=None, groups=None):
94 """Generates boolean masks corresponding to test sets.
95
96 By default, delegates to _iter_test_indices(X, y, groups)
97 """
---> 98 for test_index in self._iter_test_indices(X, y, groups):
99 test_mask = np.zeros(_num_samples(X), dtype=bool)
100 test_mask[test_index] = True
File ~/miniforge3/lib/python3.10/site-packages/sklearn/model_selection/_split.py:163, in LeaveOneOut._iter_test_indices(self, X, y, groups)
161 n_samples = _num_samples(X)
162 if n_samples <= 1:
--> 163 raise ValueError(
164 "Cannot perform LeaveOneOut with n_samples={}.".format(n_samples)
165 )
166 return range(n_samples)
ValueError: Cannot perform LeaveOneOut with n_samples=1.
Curiously, if I duplicate the contents of x and y, the error goes away and a score of 0.0 is returned:
# input list of values
x = [[2, 5, 4, 3, 4, 6, 7, 5, 8, 9], [2, 5, 4, 3, 4, 6, 7, 5, 8, 9]]
# define the output as the mean of the inputs, as specified in the question
y = [np.mean(x), np.mean(x)]
...
...
...
>>> 0.0
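For completeness, here is the duplicated version written out in full, assuming the elided lines are identical to the first snippet; it runs without error and prints the 0.0:
from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.linear_model import LinearRegression
import numpy as np

# same list of values twice, so LeaveOneOut now sees n_samples=2
x = [[2, 5, 4, 3, 4, 6, 7, 5, 8, 9], [2, 5, 4, 3, 4, 6, 7, 5, 8, 9]]
# the output duplicated the same way
y = [np.mean(x), np.mean(x)]

model = LinearRegression()
cv = LeaveOneOut()
scores = cross_val_score(model, x, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
print(np.mean(np.absolute(scores)))  # 0.0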
Why is that?