Commit b2c7d8b

feat: support score() in GeminiTextGenerator (#740)
* feat: support score in GeminiTextGenerator
* address comments
* reorganize the tests
1 parent ad7d8ac commit b2c7d8b

3 files changed: 119 additions, 0 deletions

bigframes/ml/llm.py

Lines changed: 61 additions & 0 deletions
@@ -732,6 +732,67 @@ def predict(

         return df

+    def score(
+        self,
+        X: Union[bpd.DataFrame, bpd.Series],
+        y: Union[bpd.DataFrame, bpd.Series],
+        task_type: Literal[
+            "text_generation", "classification", "summarization", "question_answering"
+        ] = "text_generation",
+    ) -> bpd.DataFrame:
+        """Calculate evaluation metrics of the model. Only the "gemini-pro" model is supported for now.
+
+        .. note::
+
+            This product or feature is subject to the "Pre-GA Offerings Terms" in the General Service Terms section of the
+            Service Specific Terms (https://cloud.google.com/terms/service-terms#1). Pre-GA products and features are available "as is"
+            and might have limited support. For more information, see the launch stage descriptions
+            (https://cloud.google.com/products#product-launch-stages).
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#remote-model-llm
+            for the outputs relevant to this model type.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation data, which contains only one column of input_text
+                that contains the prompt text to use when evaluating the model.
+            y (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                A BigQuery DataFrame as evaluation labels, which contains only one column of output_text
+                that you would expect to be returned by the model.
+            task_type (str):
+                The type of task for the LLM model. Defaults to "text_generation".
+                Possible values: "text_generation", "classification", "summarization", and "question_answering".
+
+        Returns:
+            bigframes.dataframe.DataFrame: The DataFrame as evaluation result.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        # TODO(ashleyxu): Support gemini-1.5 when the rollout is ready. b/344891364.
+        if self._bqml_model.model_name.startswith("gemini-1.5"):
+            raise NotImplementedError("Score is not supported for gemini-1.5 model.")
+
+        X, y = utils.convert_to_dataframe(X, y)
+
+        if len(X.columns) != 1 or len(y.columns) != 1:
+            raise ValueError(
+                f"Only support one column as input for X and y. {constants.FEEDBACK_LINK}"
+            )
+
+        # BQML identifies the columns by name
+        X_col_label = cast(blocks.Label, X.columns[0])
+        y_col_label = cast(blocks.Label, y.columns[0])
+        X = X.rename(columns={X_col_label: "input_text"})
+        y = y.rename(columns={y_col_label: "output_text"})
+
+        input_data = X.join(y, how="outer")
+
+        return self._bqml_model.llm_evaluate(input_data, task_type)
+
     def to_gbq(self, model_name: str, replace: bool = False) -> GeminiTextGenerator:
         """Save the model to BigQuery.
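For reference, a minimal usage sketch of the new method (not part of the commit), assuming a configured BigQuery DataFrames session and Vertex AI connection; the sample data and column names here are hypothetical:

    import bigframes.pandas as bpd
    from bigframes.ml import llm

    # Hypothetical single-column evaluation data: score() expects exactly one
    # column in X (the prompts) and one in y (the expected outputs).
    eval_df = bpd.DataFrame(
        {
            "prompt": ["What is BigQuery?", "What is BigQuery DataFrames?"],
            "expected": [
                "BigQuery is a serverless data warehouse.",
                "BigQuery DataFrames provides a pandas-like API for BigQuery.",
            ],
        }
    )

    model = llm.GeminiTextGenerator(model_name="gemini-pro")
    # score() renames the single columns to "input_text"/"output_text" and
    # delegates to BigQuery's ML.EVALUATE through llm_evaluate().
    metrics = model.score(X=eval_df[["prompt"]], y=eval_df[["expected"]]).to_pandas()
    print(metrics)  # bleu4_score and rouge-l_* columns for the default task_type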

tests/system/small/ml/conftest.py

Lines changed: 12 additions & 0 deletions
@@ -171,6 +171,18 @@ def llm_text_pandas_df():
     )


+@pytest.fixture(scope="session")
+def llm_fine_tune_df_default_index(
+    session: bigframes.Session,
+) -> bigframes.dataframe.DataFrame:
+    training_table_name = "llm_tuning.emotion_classification_train"
+    df = session.read_gbq(training_table_name)
+    prefix = "Please do sentiment analysis on the following text and only output a number from 0 to 5 where 0 means sadness, 1 means joy, 2 means love, 3 means anger, 4 means fear, and 5 means surprise. Text: "
+    df["prompt"] = prefix + df["text"]
+    df["label"] = df["label"].astype("string")
+    return df
+
+
 @pytest.fixture(scope="session")
 def onnx_iris_pandas_df():
     """Data matching the iris dataset."""
tests/system/small/ml/test_llm.py

Lines changed: 46 additions & 0 deletions
@@ -15,6 +15,7 @@
 import pytest

 from bigframes.ml import llm
+from tests.system import utils


 def test_create_text_generator_model(
@@ -366,3 +367,48 @@ def test_gemini_text_generator_predict_with_params_success(
     assert "ml_generate_text_llm_result" in df.columns
     series = df["ml_generate_text_llm_result"]
     assert all(series.str.len() > 20)
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score(llm_fine_tune_df_default_index):
+    model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check that score() runs and returns the expected evaluation schema.
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index[["prompt"]],
+        y=llm_fine_tune_df_default_index[["label"]],
+    ).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "bleu4_score",
+            "rouge-l_precision",
+            "rouge-l_recall",
+            "rouge-l_f1_score",
+            "evaluation_status",
+        ],
+        index=1,
+    )
+
+
+@pytest.mark.flaky(retries=2)
+def test_llm_gemini_pro_score_params(llm_fine_tune_df_default_index):
+    model = llm.GeminiTextGenerator(model_name="gemini-pro")
+
+    # Check that score() accepts Series inputs and a non-default task_type.
+    score_result = model.score(
+        X=llm_fine_tune_df_default_index["prompt"],
+        y=llm_fine_tune_df_default_index["label"],
+        task_type="classification",
+    ).to_pandas()
+    utils.check_pandas_df_schema_and_index(
+        score_result,
+        columns=[
+            "precision",
+            "recall",
+            "f1_score",
+            "label",
+            "evaluation_status",
+        ],
+        index=6,
+    )
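A note on the expected indexes in these tests: for the default "text_generation" task, ML.EVALUATE returns a single summary row of corpus-level metrics (hence index=1), while for "classification" it appears to return one precision/recall/f1_score row per class label, six rows for the six emotion classes in the fixture (hence index=6).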
