Merge branch 'main' into add-keypoint-detection

huggingface · Aug 31, 2024 · dbe94fa · dbe94fa
2 parents e8551a4 + 94cb7fe
commit dbe94fa
Show file tree

Hide file tree

Showing 31 changed files with 249 additions and 107 deletions.
diff --git a/packages/tasks/package.json b/packages/tasks/package.json
@@ -1,7 +1,7 @@
 {
 	"name": "@huggingface/tasks",
 	"packageManager": "pnpm@8.10.5",
-	"version": "0.11.11",
+	"version": "0.11.12",
 	"description": "List of ML tasks for huggingface.co/tasks",
 	"repository": "https://github.com/huggingface/huggingface.js.git",
 	"publishConfig": {

diff --git a/packages/tasks/src/model-libraries-snippets.ts b/packages/tasks/src/model-libraries-snippets.ts
@@ -170,6 +170,48 @@ export const diffusers = (model: ModelData): string[] => {
 	}
 };
 
+/** Python snippets for DiffusionKit (MLX): build the pipeline, then generate an image. */
+export const diffusionkit = (model: ModelData): string[] => {
+	// Repos tagged "flux" get FluxPipeline and their own step/CFG defaults; others get the SD3 setup.
+	const isFlux = model.tags.includes("flux");
+	// model.id is wrapped in quotes so the emitted Python receives a string literal.
+	const sd3Snippet = `# Pipeline for Stable Diffusion 3
+from diffusionkit.mlx import DiffusionPipeline
+
+pipeline = DiffusionPipeline(
+	shift=3.0,
+	use_t5=False,
+	model_version="${model.id}",
+	low_memory_mode=True,
+	a16=True,
+	w16=True,
+)`;
+	const fluxSnippet = `# Pipeline for Flux
+from diffusionkit.mlx import FluxPipeline
+
+pipeline = FluxPipeline(
+  shift=1.0,
+  model_version="${model.id}",
+  low_memory_mode=True,
+  a16=True,
+  w16=True,
+)`;
+	const generateSnippet = `# Image Generation
+HEIGHT = 512
+WIDTH = 512
+NUM_STEPS = ${isFlux ? 4 : 50}
+CFG_WEIGHT = ${isFlux ? 0 : 5}
+
+image, _ = pipeline.generate_image(
+  "a photo of a cat",
+  cfg_weight=CFG_WEIGHT,
+  num_steps=NUM_STEPS,
+  latent_size=(HEIGHT // 8, WIDTH // 8),
+)`;
+
+	return [isFlux ? fluxSnippet : sd3Snippet, generateSnippet];
+};
+
 export const cartesia_pytorch = (model: ModelData): string[] => [
 	`# pip install --no-binary :all: cartesia-pytorch
 from cartesia_pytorch import ReneLMHeadModel

diff --git a/packages/tasks/src/model-libraries.ts b/packages/tasks/src/model-libraries.ts
@@ -155,7 +155,7 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
 		repoName: "deepforest",
 		docsUrl: "https://deepforest.readthedocs.io/en/latest/",
 		repoUrl: "https://github.com/weecology/DeepForest",
-		countDownloads: `path_extension:"pt"`,
+		countDownloads: `path_extension:"pt" OR path_extension:"pl"`,
 	},
 	"depth-anything-v2": {
 		prettyLabel: "DepthAnythingV2",
@@ -181,6 +181,12 @@ export const MODEL_LIBRARIES_UI_ELEMENTS = {
 		filter: true,
 		/// diffusers has its own more complex "countDownloads" query
 	},
+	diffusionkit: {
+		prettyLabel: "DiffusionKit",
+		repoName: "DiffusionKit",
+		repoUrl: "https://github.com/argmaxinc/DiffusionKit",
+		snippets: snippets.diffusionkit,
+	},
 	doctr: {
 		prettyLabel: "docTR",
 		repoName: "doctr",

diff --git a/packages/tasks/src/tasks/audio-classification/inference.ts b/packages/tasks/src/tasks/audio-classification/inference.ts
@@ -8,9 +8,10 @@
  */
 export interface AudioClassificationInput {
 	/**
-	 * The input audio data
+	 * The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+	 * also provide the audio data as a raw bytes payload.
 	 */
-	inputs: unknown;
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */

diff --git a/packages/tasks/src/tasks/audio-classification/spec/input.json b/packages/tasks/src/tasks/audio-classification/spec/input.json
@@ -6,7 +6,8 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The input audio data"
+			"description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",

diff --git a/packages/tasks/src/tasks/audio-classification/spec/output.json b/packages/tasks/src/tasks/audio-classification/spec/output.json
@@ -5,6 +5,7 @@
 	"description": "Outputs for Audio Classification inference",
 	"type": "array",
 	"items": {
+		"type": "object",
 		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts b/packages/tasks/src/tasks/automatic-speech-recognition/inference.ts
@@ -9,9 +9,10 @@
  */
 export interface AutomaticSpeechRecognitionInput {
 	/**
-	 * The input audio data
+	 * The input audio data as a base64-encoded string. If no `parameters` are provided, you can
+	 * also provide the audio data as a raw bytes payload.
 	 */
-	inputs: unknown;
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */

diff --git a/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json b/packages/tasks/src/tasks/automatic-speech-recognition/spec/input.json
@@ -6,7 +6,8 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The input audio data"
+			"description": "The input audio data as a base64-encoded string. If no `parameters` are provided, you can also provide the audio data as a raw bytes payload.",
+			"type": "string"
 		},
 		"parameters": {
 			"description": "Additional inference parameters",

diff --git a/packages/tasks/src/tasks/common-definitions.json b/packages/tasks/src/tasks/common-definitions.json
@@ -7,17 +7,7 @@
 			"title": "ClassificationOutputTransform",
 			"type": "string",
 			"description": "The function to apply to the model outputs in order to retrieve the scores.",
-			"oneOf": [
-				{
-					"const": "sigmoid"
-				},
-				{
-					"const": "softmax"
-				},
-				{
-					"const": "none"
-				}
-			]
+			"enum": ["sigmoid", "softmax", "none"]
 		},
 		"ClassificationOutput": {
 			"title": "ClassificationOutput",
@@ -84,16 +74,9 @@
 					"description": "Whether to use sampling instead of greedy decoding when generating new tokens."
 				},
 				"early_stopping": {
+					"type": ["boolean", "string"],
 					"description": "Controls the stopping condition for beam-based methods.",
-					"oneOf": [
-						{
-							"type": "boolean"
-						},
-						{
-							"const": "never",
-							"type": "string"
-						}
-					]
+					"enum": ["never", true, false]
 				},
 				"num_beams": {
 					"type": "integer",

diff --git a/packages/tasks/src/tasks/image-classification/inference.ts b/packages/tasks/src/tasks/image-classification/inference.ts
@@ -8,9 +8,10 @@
  */
 export interface ImageClassificationInput {
 	/**
-	 * The input image data
+	 * The input image data as a base64-encoded string. If no `parameters` are provided, you can
+	 * also provide the image data as a raw bytes payload.
 	 */
-	inputs: unknown;
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */

diff --git a/packages/tasks/src/tasks/image-classification/spec/input.json b/packages/tasks/src/tasks/image-classification/spec/input.json
@@ -6,7 +6,8 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The input image data"
+			"type": "string",
+			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
 		},
 		"parameters": {
 			"description": "Additional inference parameters",

diff --git a/packages/tasks/src/tasks/image-classification/spec/output.json b/packages/tasks/src/tasks/image-classification/spec/output.json
@@ -5,6 +5,7 @@
 	"title": "ImageClassificationOutput",
 	"type": "array",
 	"items": {
+		"type": "object",
 		"$ref": "/inference/schemas/common-definitions.json#/definitions/ClassificationOutput"
 	}
 }
diff --git a/packages/tasks/src/tasks/image-segmentation/inference.ts b/packages/tasks/src/tasks/image-segmentation/inference.ts
@@ -8,9 +8,10 @@
  */
 export interface ImageSegmentationInput {
 	/**
-	 * The input image data
+	 * The input image data as a base64-encoded string. If no `parameters` are provided, you can
+	 * also provide the image data as a raw bytes payload.
 	 */
-	inputs: unknown;
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
@@ -41,6 +42,9 @@ export interface ImageSegmentationParameters {
 	threshold?: number;
 	[property: string]: unknown;
 }
+/**
+ * Segmentation task to be performed, depending on model capabilities.
+ */
 export type ImageSegmentationSubtask = "instance" | "panoptic" | "semantic";
 export type ImageSegmentationOutput = ImageSegmentationOutputElement[];
 /**
@@ -50,15 +54,15 @@ export type ImageSegmentationOutput = ImageSegmentationOutputElement[];
  */
 export interface ImageSegmentationOutputElement {
 	/**
-	 * The label of the predicted segment
+	 * The label of the predicted segment.
 	 */
 	label: string;
 	/**
-	 * The corresponding mask as a black-and-white image
+	 * The corresponding mask as a black-and-white image (base64-encoded).
 	 */
-	mask: unknown;
+	mask: string;
 	/**
-	 * The score or confidence degreee the model has
+	 * The score or confidence degree the model has.
 	 */
 	score?: number;
 	[property: string]: unknown;

diff --git a/packages/tasks/src/tasks/image-segmentation/spec/input.json b/packages/tasks/src/tasks/image-segmentation/spec/input.json
@@ -6,7 +6,8 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The input image data"
+			"type": "string",
+			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -31,17 +32,7 @@
 					"title": "ImageSegmentationSubtask",
 					"type": "string",
 					"description": "Segmentation task to be performed, depending on model capabilities.",
-					"oneOf": [
-						{
-							"const": "instance"
-						},
-						{
-							"const": "panoptic"
-						},
-						{
-							"const": "semantic"
-						}
-					]
+					"enum": ["instance", "panoptic", "semantic"]
 				},
 				"threshold": {
 					"type": "number",

diff --git a/packages/tasks/src/tasks/image-segmentation/spec/output.json b/packages/tasks/src/tasks/image-segmentation/spec/output.json
@@ -10,14 +10,15 @@
 		"properties": {
 			"label": {
 				"type": "string",
-				"description": "The label of the predicted segment"
+				"description": "The label of the predicted segment."
 			},
 			"mask": {
-				"description": "The corresponding mask as a black-and-white image"
+				"type": "string",
+				"description": "The corresponding mask as a black-and-white image (base64-encoded)."
 			},
 			"score": {
 				"type": "number",
-				"description": "The score or confidence degreee the model has"
+				"description": "The score or confidence degree the model has."
 			}
 		},
 		"required": ["label", "mask"]

diff --git a/packages/tasks/src/tasks/image-to-image/inference.ts b/packages/tasks/src/tasks/image-to-image/inference.ts
@@ -9,9 +9,10 @@
  */
 export interface ImageToImageInput {
 	/**
-	 * The input image data
+	 * The input image data as a base64-encoded string. If no `parameters` are provided, you can
+	 * also provide the image data as a raw bytes payload.
 	 */
-	inputs: unknown;
+	inputs: string;
 	/**
 	 * Additional inference parameters
 	 */
@@ -40,14 +41,14 @@ export interface ImageToImageParameters {
 	 */
 	num_inference_steps?: number;
 	/**
-	 * The size in pixel of the output image
+	 * The size in pixel of the output image.
 	 */
 	target_size?: TargetSize;
 	[property: string]: unknown;
 }
 
 /**
- * The size in pixel of the output image
+ * The size in pixel of the output image.
  */
 export interface TargetSize {
 	height: number;
@@ -60,7 +61,7 @@ export interface TargetSize {
  */
 export interface ImageToImageOutput {
 	/**
-	 * The output image
+	 * The output image returned as raw bytes in the payload.
 	 */
 	image?: unknown;
 	[property: string]: unknown;

diff --git a/packages/tasks/src/tasks/image-to-image/spec/input.json b/packages/tasks/src/tasks/image-to-image/spec/input.json
@@ -6,7 +6,8 @@
 	"type": "object",
 	"properties": {
 		"inputs": {
-			"description": "The input image data"
+			"type": "string",
+			"description": "The input image data as a base64-encoded string. If no `parameters` are provided, you can also provide the image data as a raw bytes payload."
 		},
 		"parameters": {
 			"description": "Additional inference parameters",
@@ -36,7 +37,7 @@
 				},
 				"target_size": {
 					"type": "object",
-					"description": "The size in pixel of the output image",
+					"description": "The size in pixel of the output image.",
 					"properties": {
 						"width": {
 							"type": "integer"

diff --git a/packages/tasks/src/tasks/image-to-image/spec/output.json b/packages/tasks/src/tasks/image-to-image/spec/output.json
@@ -6,7 +6,7 @@
 	"type": "object",
 	"properties": {
 		"image": {
-			"description": "The output image"
+			"description": "The output image returned as raw bytes in the payload."
 		}
 	}
 }
diff --git a/packages/tasks/src/tasks/index.ts b/packages/tasks/src/tasks/index.ts
@@ -73,12 +73,7 @@ export type * from "./table-question-answering/inference";
 export type { TextToImageInput, TextToImageOutput, TextToImageParameters } from "./text-to-image/inference";
 export type { TextToAudioParameters, TextToSpeechInput, TextToSpeechOutput } from "./text-to-speech/inference";
 export type * from "./token-classification/inference";
-export type {
-	Text2TextGenerationParameters,
-	Text2TextGenerationTruncationStrategy,
-	TranslationInput,
-	TranslationOutput,
-} from "./translation/inference";
+export type { TranslationInput, TranslationOutput } from "./translation/inference";
 export type {
 	ClassificationOutputTransform,
 	TextClassificationInput,