Swift开发者的Text Generation Inference入门到精通指南 (2025年05月)

引言

在2025年的今天，文本生成(Text Generation)已经成为移动应用开发中不可或缺的功能。作为Swift开发者，掌握如何在iOS/macOS应用中集成文本生成推理(Text Generation Inference)能力将大大提升你的应用竞争力。本文将带你从零开始，逐步掌握如何在Swift项目中实现高效的文本生成功能。

准备工作

在开始之前，请确保你的开发环境满足以下要求：

Xcode 15+ (推荐最新稳定版)
macOS 13+ 或 iOS 16+ 作为目标平台
Swift 5.9+
Python环境(用于模型转换)
至少8GB可用内存(运行较大模型需要更多)

安装必要工具

代码片段

# Install the Swift toolchain via Homebrew.
# Optional: Xcode already bundles Swift and the Swift Package Manager.
brew install swift

# Install the Python dependencies used for model export and conversion.
# coremltools is required by the Core ML conversion step later in this guide.
pip install torch transformers onnxruntime coremltools

第一步：理解Text Generation Inference基础

文本生成推理的核心是将预训练的语言模型(如GPT、LLaMA等)部署到你的应用中，使其能够根据输入提示(prompt)生成连贯的文本。

Swift中的实现方式

在Swift生态中，我们主要有三种实现方式：

本地运行：将模型转换为Core ML格式直接运行
服务器API调用：通过HTTP请求访问远程服务
混合模式：小模型本地运行，大模型远程调用

本文将重点介绍第一种方式——本地运行模型。

第二步：准备文本生成模型

我们将使用Hugging Face的transformers库来获取和转换一个适合移动端的小型语言模型。

选择适合移动端的模型

对于移动设备，推荐使用以下小型模型：
– DistilGPT-2 (小型GPT-2)
– TinyLLaMA (精简版LLaMA)
– MobileBERT (专为移动设备优化；注意它是仅编码器模型，适合文本理解任务，不能直接用于自回归文本生成)

代码片段

# Python: export DistilGPT-2 to ONNX format
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch


class LogitsOnly(torch.nn.Module):
    """Wrap the language model so the exported graph is (input_ids, attention_mask) -> logits."""

    def __init__(self, lm):
        super().__init__()
        self.lm = lm

    def forward(self, input_ids, attention_mask):
        # Keyword arguments are required here: the second positional parameter of
        # GPT2LMHeadModel.forward is past_key_values, not attention_mask.
        # use_cache=False keeps the key/value cache out of the exported outputs,
        # so the single declared output name ("logits") matches the graph.
        outputs = self.lm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=False,
            return_dict=False,
        )
        return outputs[0]


# Load the model and tokenizer
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()  # inference mode: disables dropout so the traced graph is deterministic

# Example input used to trace the graph
inputs = tokenizer("Hello, how are", return_tensors="pt")

# Export to ONNX format
torch.onnx.export(
    LogitsOnly(model),
    (inputs["input_ids"], inputs["attention_mask"]),
    f="distilgpt2.onnx",
    input_names=["input_ids", "attention_mask"],
    output_names=["logits"],
    dynamic_axes={
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
        "logits": {0: "batch", 1: "sequence"}
    },
    opset_version=13
)

转换为Core ML格式

使用苹果的coremltools转换为Core ML格式（注意：coremltools 6及以上版本已移除ONNX转换器，在这些版本中需要直接从PyTorch模型转换）：

代码片段

import coremltools as ct
import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# NOTE: coremltools no longer ships an ONNX frontend (ct.converters.onnx was
# removed in coremltools 6), so the model is converted directly from a traced
# PyTorch module instead of from the .onnx file.


class LogitsOnly(torch.nn.Module):
    """Wrap the language model so the traced graph is (input_ids, attention_mask) -> logits."""

    def __init__(self, lm):
        super().__init__()
        self.lm = lm

    def forward(self, input_ids, attention_mask):
        # Keyword arguments avoid binding attention_mask to past_key_values;
        # use_cache=False keeps the key/value cache out of the outputs.
        return self.lm(
            input_ids=input_ids,
            attention_mask=attention_mask,
            use_cache=False,
        )[0]


# Load the PyTorch model in TorchScript-friendly mode (tuple outputs)
model_name = "distilgpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
lm = GPT2LMHeadModel.from_pretrained(model_name, torchscript=True)
lm.eval()

# Trace with a representative input
example = tokenizer("Hello, how are", return_tensors="pt")
traced_model = torch.jit.trace(
    LogitsOnly(lm),
    (example["input_ids"], example["attention_mask"]),
)

# Convert to Core ML format.
# Inputs are declared as int32 because the Swift side fills int32 MLMultiArrays;
# the output is named "logits" so the generated Swift class exposes `logits`.
coreml_model = ct.convert(
    traced_model,
    convert_to="mlprogram",
    inputs=[ct.TensorType(name="input_ids", shape=(1, ct.RangeDim(1, 128)), dtype=np.int32),
            ct.TensorType(name="attention_mask", shape=(1, ct.RangeDim(1, 128)), dtype=np.int32)],
    outputs=[ct.TensorType(name="logits")],
    minimum_deployment_target=ct.target.iOS16,
)

# Save the Core ML model
coreml_model.save("DistilGPT2.mlpackage")

第三步：在Swift项目中集成Core ML模型

创建新项目并添加模型

Xcode中创建新项目（File > New > Project）
选择“App”模板
将生成的DistilGPT2.mlpackage拖入项目导航器

Swift实现文本生成逻辑

代码片段

import CoreML
import Foundation

/// Greedy (arg-max) text generator backed by the `DistilGPT2` Core ML model.
///
/// `DistilGPT2` and `DistilGPT2Input` are the classes Xcode generates from
/// `DistilGPT2.mlpackage`; the model is expected to take `input_ids` and
/// `attention_mask` and to produce `logits`.
///
/// The tokenizer used here is a placeholder that maps ASCII bytes to ids. It
/// does NOT match GPT-2's BPE vocabulary, so generated text is not meaningful
/// until a real tokenizer is plugged in.
class TextGenerator {
    private let model: DistilGPT2

    /// Upper bound on the sequence length (the converted model accepts 1...128 tokens).
    private let maxLength = 128
    /// GPT-2 end-of-text token id.
    private let eosTokenId = 50256

    /// Loads the Core ML model.
    /// - Throws: Any error raised while loading the compiled model.
    init() throws {
        let config = MLModelConfiguration()
        config.computeUnits = .cpuAndGPU // run on CPU and GPU

        self.model = try DistilGPT2(configuration: config)
    }

    /// Generates a continuation of `prompt` one token at a time.
    /// - Parameters:
    ///   - prompt: Text to continue.
    ///   - maxNewTokens: Maximum number of tokens to append; values `<= 0` generate nothing.
    /// - Returns: The detokenized prompt followed by the generated tokens. The
    ///   end-of-sequence token itself is not included.
    func generateText(prompt: String, maxNewTokens: Int = 50) -> String {
        var tokenIds = tokenize(text: prompt)

        // Nothing to condition on, nothing requested, or a prompt the model cannot accept.
        guard !tokenIds.isEmpty, tokenIds.count <= maxLength, maxNewTokens > 0 else {
            return detokenize(tokenIds: tokenIds)
        }

        for _ in 0..<maxNewTokens {
            // Stop on prediction failure or at the end-of-sequence token.
            guard let nextTokenId = predictNextToken(inputIds: tokenIds),
                  nextTokenId != eosTokenId else {
                break
            }

            tokenIds.append(nextTokenId)

            // Never grow past the sequence length the model was converted for.
            if tokenIds.count >= maxLength {
                break
            }
        }

        return detokenize(tokenIds: tokenIds)
    }

    /// Runs one forward pass and returns the arg-max token for the last position.
    /// - Returns: `nil` when `inputIds` is empty or the prediction fails.
    private func predictNextToken(inputIds: [Int]) -> Int? {
        // Check before building inputs: a zero-length sequence is not a valid model input.
        guard !inputIds.isEmpty else { return nil }

        do {
            let shape: [NSNumber] = [1, NSNumber(value: inputIds.count)]
            let inputArray = try MLMultiArray(shape: shape, dataType: .int32)
            let maskArray = try MLMultiArray(shape: shape, dataType: .int32)

            for (index, id) in inputIds.enumerated() {
                inputArray[index] = NSNumber(value: id)
                maskArray[index] = 1 // every position holds a real token (no padding)
            }

            // The converted model exposes exactly these two input features.
            let input = DistilGPT2Input(input_ids: inputArray, attention_mask: maskArray)
            let output = try model.prediction(input: input)

            return argmaxOfLastPosition(in: output.logits)
        } catch {
            print("Prediction error:", error.localizedDescription)
            return nil
        }
    }

    /// Returns the index of the largest logit along the last axis, taken at the
    /// final position of the second-to-last axis and index 0 of any leading axes.
    private func argmaxOfLastPosition(in logits: MLMultiArray) -> Int? {
        let shape = logits.shape.map(\.intValue)
        let strides = logits.strides.map(\.intValue)

        guard let vocabSize = shape.last, vocabSize > 0,
              let vocabStride = strides.last,
              shape.count == strides.count else {
            return nil
        }

        let rank = shape.count
        var lastPosition = 0
        if rank >= 2 {
            guard shape[rank - 2] > 0 else { return nil }
            lastPosition = shape[rank - 2] - 1
        }

        // Choose how to read a single logit depending on the storage type.
        let readLogit: (Int) -> Float
        if logits.dataType == .float32 {
            // Fast path: raw pointer access, honoring the array's strides.
            let base = rank >= 2 ? lastPosition * strides[rank - 2] : 0
            let pointer = logits.dataPointer.assumingMemoryBound(to: Float32.self)
            readLogit = { pointer[base + $0 * vocabStride] }
        } else {
            // Safe path for other element types (e.g. float16): subscript access.
            var index = [NSNumber](repeating: 0, count: rank)
            if rank >= 2 {
                index[rank - 2] = NSNumber(value: lastPosition)
            }
            readLogit = { token in
                index[rank - 1] = NSNumber(value: token)
                return logits[index].floatValue
            }
        }

        var bestIndex: Int?
        var bestValue = -Float.infinity
        for token in 0..<vocabSize {
            let value = readLogit(token)
            if value > bestValue {
                bestValue = value
                bestIndex = token
            }
        }
        return bestIndex
    }

    /// Placeholder tokenizer: one id per ASCII character; non-ASCII characters are dropped.
    /// Replace with a real GPT-2 BPE tokenizer in production.
    private func tokenize(text: String) -> [Int] {
        return text.compactMap { $0.asciiValue }.map { Int($0) % 10000 }
    }

    /// Placeholder detokenizer: maps each id (mod 10000) to the Unicode scalar with
    /// that value, skipping values that are not valid scalars.
    private func detokenize(tokenIds: [Int]) -> String {
        return String(tokenIds.compactMap { UnicodeScalar($0 % 10000).map { Character($0) } })
    }
}

SwiftUI中使用文本生成器

代码片段

import SwiftUI

/// Prompt entry screen: a text field, a "Generate Text" button, and a
/// scrollable area showing the generated text.
struct ContentView: View {
    @State private var promptText = ""
    /// Message returned synchronously by the view model (for example an
    /// initialization error); empty when generation started normally.
    @State private var generatedText = ""

    @StateObject private var generatorViewModel = GeneratorViewModel()

    /// The synchronous message wins when present; otherwise show the result
    /// the view model publishes once background generation finishes.
    private var displayedText: String {
        generatedText.isEmpty ? generatorViewModel.generatedText : generatedText
    }

    var body: some View {
        VStack(spacing: 20) {
            TextField("Enter your prompt...", text: $promptText)
                .textFieldStyle(.roundedBorder)

            Button("Generate Text") {
                // Use the bound prompt state; the generated text itself arrives
                // later through generatorViewModel.generatedText.
                generatedText = generatorViewModel.generateText(prompt: promptText)
            }
            .buttonStyle(.borderedProminent)
            .disabled(generatorViewModel.isGenerating)

            ScrollView {
                Text(displayedText).padding()
            }
            .frame(maxWidth: .infinity, maxHeight: .infinity)
            .background(Color(.systemGray6)) // UIKit system color (iOS)
        }
        .padding()
    }
}

 /// View model that owns the text generator and publishes generation state.
 ///
 /// Generation runs on a background queue; `generatedText` and `isGenerating`
 /// are updated on the main queue when it finishes.
 @MainActor class GeneratorViewModel: ObservableObject {

     /// `true` from the moment generation starts until its result is published.
     @Published var isGenerating = false
     /// The most recently generated text.
     @Published var generatedText = ""

     /// Created on first use; `nil` if the Core ML model failed to load.
     private lazy var textGenerator: TextGenerator? = {
         do {
             return try TextGenerator()
         } catch {
             print("Failed to initialize text generator:", error.localizedDescription)
             return nil
         }
     }()

     /// Starts asynchronous generation for `prompt`.
     /// - Parameter prompt: Text to continue; an empty prompt is ignored.
     /// - Returns: An error message if the model could not be initialized,
     ///   otherwise an empty string. The generated text is delivered later
     ///   through `generatedText`.
     @discardableResult
     func generateText(prompt: String) -> String {
         guard !prompt.isEmpty else { return "" }

         // Ignore re-entrant requests while a generation is still running.
         guard !isGenerating else { return "" }

         guard let generator = textGenerator else { return "Error initializing model" }

         isGenerating = true

         DispatchQueue.global(qos: .userInitiated).async { [weak self] in
             let result = generator.generateText(prompt: prompt, maxNewTokens: 50)

             DispatchQueue.main.async {
                 guard let self else { return }
                 self.generatedText = result
                 // Reset only once the work is done, so the UI stays disabled meanwhile.
                 self.isGenerating = false
             }
         }

         return ""
     }
 }

第四步：优化与进阶技巧

GPU加速与内存优化

代码片段

import CoreML
import Metal // for MTLCreateSystemDefaultDevice()

// MLModelConfiguration tuning example
let configuration = MLModelConfiguration()

// Run on CPU and GPU. Use .all to also allow the Neural Engine where available.
configuration.computeUnits = .cpuAndGPU

// Allow reduced-precision accumulation on the GPU: faster, at some cost in accuracy.
configuration.allowLowPrecisionAccumulationOnGPU = true

// Prefer a specific Metal device for GPU work (here: the system default device).
configuration.preferredMetalDevice = MTLCreateSystemDefaultDevice()

// Note: MLModelConfiguration has no CPU thread-count setting, so none is set here.

let model = try DistilGPT2(configuration: configuration)

Tokenizer优化建议

在实际应用中，应该使用完整的Tokenizer实现：

代码片段

/// Vocabulary helpers for a dictionary-based tokenizer.
///
/// `encode` uses greedy longest-match lookup against the vocabulary. This is a
/// generic fallback, not GPT-2's byte-level BPE (which also needs the merge
/// rules and byte-to-unicode mapping); swap in a full BPE implementation when
/// exact agreement with the model's training tokenizer is required.
class TokenizerUtils {

    /// Loads a token-to-id vocabulary from a JSON file shaped like `{"token": 123, ...}`.
    /// - Parameter vocabURL: File URL of the vocabulary JSON.
    /// - Returns: The vocabulary, or `nil` if the file is missing, unreadable,
    ///   or not a string-to-integer JSON object (the reason is printed).
    static func loadTokenizer(from vocabURL: URL) -> [String: Int]? {
        guard FileManager.default.fileExists(atPath: vocabURL.path) else {
            print("Vocabulary file not found at \(vocabURL.path)")
            return nil
        }

        do {
            let data = try Data(contentsOf: vocabURL)
            // Decoding throws on a schema mismatch, so that case is logged too.
            return try JSONDecoder().decode([String: Int].self, from: data)
        } catch {
            print("Error loading vocabulary:", error)
            return nil
        }
    }

    /// Encodes `text` by repeatedly taking the longest vocabulary entry that
    /// matches at the current position.
    /// - Parameters:
    ///   - text: Text to encode.
    ///   - vocab: Token-to-id map, as returned by `loadTokenizer(from:)`.
    /// - Returns: The matched token ids in order. Characters that start no
    ///   vocabulary entry are skipped.
    static func encode(text: String, vocab: [String: Int]) -> [Int] {
        guard !text.isEmpty,
              let longestToken = vocab.keys.map(\.count).max(),
              longestToken > 0 else {
            return []
        }

        let characters = Array(text)
        var ids: [Int] = []
        var start = 0

        while start < characters.count {
            var end = min(characters.count, start + longestToken)
            var matchedEnd: Int?

            // Try the longest candidate first, shrinking until something matches.
            while end > start {
                if let id = vocab[String(characters[start..<end])] {
                    ids.append(id)
                    matchedEnd = end
                    break
                }
                end -= 1
            }

            // No entry starts here: skip one character and continue.
            start = matchedEnd ?? (start + 1)
        }

        return ids
    }

    /// Decodes token ids by concatenating their vocabulary strings.
    /// - Parameters:
    ///   - tokens: Token ids to decode.
    ///   - invVocab: Id-to-token map.
    /// - Returns: The concatenated token strings; ids missing from `invVocab` are skipped.
    static func decode(tokens: [Int], invVocab: [Int: String]) -> String {
        return tokens.compactMap { invVocab[$0] }.joined()
    }
}

FAQ与常见问题解决

Q: Core ML转换失败怎么办？
A:
1.检查ONNX模型的版本是否兼容当前coremltools版本。
2.尝试降低opset版本。
3.简化模型的输入输出结构。

Q: iOS上运行速度慢怎么办？
A:
1.确保启用GPU加速。
2.减少生成长度（即代码中的maxLength和maxNewTokens参数）。
3.使用量化后的更小模型。

Q: App体积过大如何解决？
A:
1.使用App Thinning功能按设备分发不同精度的模型。
2.考虑动态下载模型的方案。

总结与未来展望

通过本文的学习，你已经掌握了在Swift项目中集成文本生成推理的基本方法。关键步骤包括：

1.选择合适的轻量级语言模型并进行格式转换。
2.CoreML模型的正确加载和配置。
3.Swift中实现基本的文本生成逻辑。
4.UI界面的构建和性能优化。

随着苹果芯片性能的不断提升和CoreML框架的持续改进，本地运行的文本生成能力将会变得更加强大。未来可以探索的方向包括：

更大型模型的优化部署
多模态生成能力
实时交互式生成体验

希望这篇指南能帮助你在Swift生态中构建出令人惊艳的文本生成应用！