2024年最新JavaScript使用LangChain构建知识图谱完全指南:数据分析实例

云信安装大师
90
AI 质量分
3 5 月, 2025
8 分钟阅读
0 阅读

2024年最新JavaScript使用LangChain构建知识图谱完全指南:数据分析实例

引言

知识图谱(Knowledge Graph)是组织和表示知识的强大工具,在数据分析领域有着广泛应用。本文将教你如何使用JavaScript和LangChain框架构建一个完整的知识图谱系统,并通过实际数据分析案例展示其应用。

准备工作

环境要求

  • Node.js 18+ (推荐20.x LTS版本)
  • npm 9+
  • Python 3.8+ (用于部分NLP处理)
  • LangChain.js最新版本

安装依赖

代码片段
npm install langchain @langchain/core @langchain/community zod pdf-parse graphql neo4j-driver axios cheerio

知识图谱基础架构

1. 数据收集层

我们首先需要从各种来源收集数据:

代码片段
const { CheerioWebBaseLoader } = require("langchain/document_loaders/web/cheerio");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");

/**
 * Fetch a web page and load it as LangChain documents.
 *
 * @param {string} url - Address handed to CheerioWebBaseLoader.
 * @returns {Promise<Array>} Whatever the loader's `load()` resolves to.
 */
async function loadWebData(url) {
  const documents = await new CheerioWebBaseLoader(url).load();
  return documents;
}

/**
 * Load a PDF file as LangChain documents.
 *
 * @param {string} filePath - Path handed to PDFLoader.
 * @returns {Promise<Array>} Whatever the loader's `load()` resolves to.
 */
async function loadPDFData(filePath) {
  const documents = await new PDFLoader(filePath).load();
  return documents;
}

2. 数据处理层

使用LangChain处理文本数据:

代码片段
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
const { OpenAIEmbeddings } = require("langchain/embeddings/openai");

/**
 * Split documents into overlapping chunks.
 *
 * Uses a RecursiveCharacterTextSplitter configured with 1000-character chunks
 * and a 200-character overlap.
 *
 * @param {Array} docs - Documents to split.
 * @returns {Promise<Array>} The result of `splitDocuments(docs)`.
 */
async function processText(docs) {
  const splitterOptions = { chunkSize: 1000, chunkOverlap: 200 };
  const chunker = new RecursiveCharacterTextSplitter(splitterOptions);
  const chunks = await chunker.splitDocuments(docs);
  return chunks;
}

/**
 * Generate embedding vectors for a list of texts with OpenAI.
 *
 * Only options that carry a real value are passed. The earlier version passed
 * `-1` as a "no limit" sentinel for `timeout`, `maxConcurrency` and
 * `dimensions`; those values are forwarded as-is (a negative concurrency or
 * dimension count is invalid), so they are omitted here and the library
 * defaults apply. Options that are not part of the embeddings configuration
 * were removed as well.
 *
 * @param {string[]} texts - Texts to embed.
 * @returns {Promise<number[][]>} One embedding vector per input text.
 */
async function generateEmbeddings(texts) {
  // Nothing to embed: skip client construction and the network round trip.
  if (Array.isArray(texts) && texts.length === 0) {
    return [];
  }

  const embeddings = new OpenAIEmbeddings({
    openAIApiKey: process.env.OPENAI_API_KEY,
    modelName: "text-embedding-3-small",
    batchSize: 512,
    maxRetries: 3,
    stripNewLines: true,
    verbose: true,
  });

  return await embeddings.embedDocuments(texts);
}

Neo4j图数据库集成

Neo4j配置与连接

代码片段
const neo4j = require('neo4j-driver');

// Neo4j connection settings.
// The URL and credentials are read from the environment (NEO_URL, NEO_USER,
// NEO_PWD) so that real secrets never live in source control; the literals
// are local-development fallbacks only.
const driver = neo4j.driver(
  process.env.NEO_URL ?? 'bolt://localhost:7687',
  neo4j.auth.basic(
    process.env.NEO_USER ?? 'neo4j',
    process.env.NEO_PWD ?? 'your_password'
  )
);

/**
 * Thin wrapper around the module-level Neo4j `driver` that opens one session
 * per query and always closes it afterwards.
 */
class Neo4jSession {
  constructor() {
    this.driver = driver;
  }

  /**
   * Run a single Cypher statement in a fresh session.
   *
   * @param {string} query - Cypher text.
   * @param {object} [params={}] - Query parameters.
   * @returns {Promise<object>} The value resolved by `session.run`.
   * @throws Rethrows any error from `session.run` after logging it.
   */
  async runQuery(query, params = {}) {
    const activeSession = this.driver.session();
    try {
      return await activeSession.run(query, params);
    } catch (queryError) {
      console.error('Neo4j查询错误:', queryError);
      throw queryError;
    } finally {
      // Runs on both the success and the failure path.
      await activeSession.close();
    }
  }

  /** Close the underlying driver. */
  async close() {
    await this.driver.close();
  }
}

LangChain与Neo4j集成实现知识图谱

LLM实体提取与关系识别

代码片段
const { ChatOpenAI } = require("langchain/chat_models/openai");
const { HumanMessage, SystemMessage } = require("langchain/schema");

// LLM configuration (GPT-4 Turbo).
// `timeout` and `maxTokens` are left at the library defaults instead of the
// `-1` sentinel, and `maxRetries` is a real positive count: a negative value
// does not mean "unlimited" and would leave transient API errors unretried.
const llm = new ChatOpenAI({
  openAIApiKey: process.env.OPENAI_API_KEY,
  modelName: "gpt-4-turbo-preview",
  temperature: 0.3,
  maxRetries: 3,
  streaming: false,
});

/**
 * Parse the text returned by the LLM into `{ entities, relations }`.
 *
 * Chat models frequently wrap JSON in a Markdown code fence; the fence is
 * stripped before parsing so such replies do not fail with a SyntaxError.
 *
 * @param {unknown} content - `response.content` from the chat model.
 * @returns {{entities: Array, relations: Array}} The validated payload.
 * @throws {Error} When the content is not a string, is not valid JSON, or
 *   lacks the `entities` / `relations` arrays.
 */
function parseExtractionResponse(content) {
  if (typeof content !== "string") {
    throw new Error("LLM response content is not a string");
  }

  const fenced = content.match(/```(?:json)?\s*([\s\S]*?)```/i);
  const jsonText = (fenced ? fenced[1] : content).trim();
  const result = JSON.parse(jsonText);

  if (result === null || typeof result !== "object") {
    throw new Error("Invalid extraction result");
  }
  if (!Array.isArray(result.entities)) throw new Error("Invalid entities format");
  if (!Array.isArray(result.relations)) throw new Error("Invalid relations format");

  return result;
}

/**
 * Ask the LLM to extract entities and relations from free text.
 *
 * @param {string} text - Source text to analyse.
 * @returns {Promise<{entities: Array, relations: Array}>} Parsed extraction result.
 * @throws Rethrows LLM invocation errors and parse/validation errors; each
 *   failure is logged exactly once.
 */
async function extractEntitiesAndRelations(text) {
  const messages = [
    new SystemMessage(`
       你是一个专业的实体关系提取器。请从以下文本中识别出:
       1. 所有重要实体(人物、组织、地点、概念等)
       2. 这些实体之间的关系

       返回JSON格式,包含entities和relations两个数组。
       entities格式:{id, name, type}
       relations格式:{source_id, target_id, type}

       确保:
       - entity的id是唯一的字符串标识符(可以是name的小写+类型缩写)
       - relation的type使用动词短语描述关系性质`),
    new HumanMessage(text),
  ];

  let response;
  try {
    response = await llm.invoke(messages);
  } catch (error) {
    console.error("Entity extraction failed:", error);
    throw error;
  }

  try {
    return parseExtractionResponse(response.content);
  } catch (parseError) {
    console.error("Failed to parse LLM response:", parseError);
    throw parseError;
  }
}

Neo4j知识图谱构建实现

代码片段
/**
 * Turn an arbitrary string into a safely quoted Cypher label or relationship
 * type. Labels and relationship types cannot be passed as query parameters,
 * so they are interpolated; quoting with backticks (and doubling any embedded
 * backtick) keeps multi-word types such as "works at" valid and prevents the
 * LLM-supplied value from breaking out of the identifier.
 *
 * @param {unknown} value - Candidate identifier.
 * @param {string} fallback - Used when `value` is not a non-empty string.
 * @returns {string} Backtick-quoted identifier.
 */
function toCypherIdentifier(value, fallback) {
  const trimmed = typeof value === 'string' ? value.trim() : '';
  const name = trimmed === '' ? fallback : trimmed;
  return '`' + name.replaceAll('`', '``') + '`';
}

/**
 * Builds a knowledge graph in Neo4j from entities and relations extracted
 * from text.
 */
class KnowledgeGraphBuilder {
  constructor() {
    this.sessionManager = new Neo4jSession();
  }

  /**
   * Create (or touch) the node for one entity.
   *
   * @param {{id: string, name?: string, type?: string}} entity
   * @returns {Promise<object>} Result of the MERGE query.
   */
  async createEntityNode(entity) {
    const label = toCypherIdentifier(entity.type, 'Entity');
    const query = `
      MERGE (e:${label} {id: $id})
      ON CREATE SET e.name = $name, e.createdAt = datetime()
      ON MATCH SET e.lastSeenAt = datetime()
      RETURN e`;

    // Pass only the parameters the query uses; a missing name becomes null
    // instead of an unbound-parameter error.
    return this.sessionManager.runQuery(query, {
      id: entity.id,
      name: entity.name ?? null,
    });
  }

  /**
   * Create (or touch) a relationship between two existing nodes.
   *
   * @param {{source_id: string, target_id: string, type?: string}} relation
   * @returns {Promise<object>} Result of the MERGE query.
   */
  async createRelationship(relation) {
    const relationType = toCypherIdentifier(relation.type, 'RELATED_TO');
    const query = `
      MATCH (source {id: $source_id}),
            (target {id: $target_id})
      MERGE (source)-[r:${relationType}]->(target)
      SET r.lastUpdatedAt = datetime()
      RETURN r`;

    return this.sessionManager.runQuery(query, {
      source_id: relation.source_id,
      target_id: relation.target_id,
    });
  }

  /**
   * Extract entities and relations from `text` and write them to Neo4j.
   * Nodes are created before relationships because the relationship query
   * MATCHes on existing nodes.
   *
   * @param {string} text - Source text.
   * @returns {Promise<void>}
   * @throws Rethrows extraction or database errors after logging them.
   */
  async buildGraphFromText(text) {
    try {
      console.log('Starting graph construction...');

      const extractionResult = await extractEntitiesAndRelations(text);
      console.log('Extracted entities and relations:', extractionResult);

      console.log('Creating entity nodes...');
      for (const entity of extractionResult.entities) {
        console.log(`Creating node for ${entity.name}...`);
        await this.createEntityNode(entity);
        console.log(`Created node for ${entity.name}`);
      }

      console.log('Creating relationships...');
      for (const relation of extractionResult.relations) {
        console.log(`Creating relationship between ${relation.source_id} and ${relation.target_id}...`);
        await this.createRelationship(relation);
        console.log(`Created relationship between ${relation.source_id} and ${relation.target_id}`);
      }

      console.log('Graph construction completed successfully!');
    } catch (error) {
      console.error('Failed to build graph:', error);
      throw error;
    }
  }
}

Cypher查询示例与可视化

代码片段
/**
 * Example query: find paths whose start or end node name contains a keyword.
 *
 * The keyword is sent as a query parameter rather than embedded in the Cypher
 * text, so any search term (including ones with quotes) is safe to use.
 *
 * Visualization can be done using:
 * - Neo4j Browser at http://localhost:7474/
 * - Third-party tools like Linkurious or GraphXR
 *
 * @param {string} [keyword='Artificial Intelligence'] - Substring to look for
 *   in node names.
 * @returns {Promise<object>} The raw query result (at most 50 paths).
 * @throws Rethrows database errors after logging them.
 */
async function queryKnowledgeGraph(keyword = 'Artificial Intelligence') {
  try {
    const sessionManager = new Neo4jSession();

    const query =
      `MATCH path=(e)-[r]->(t)
       WHERE e.name CONTAINS $keyword OR t.name CONTAINS $keyword
       RETURN path LIMIT 50`;

    const result = await sessionManager.runQuery(query, { keyword });

    console.log('Query results:', result.records.map((r) => r.toObject()));

    return result;
  } catch (error) {
    console.error('Failed to query knowledge graph:', error);
    throw error;
  }
}

LangChain代理增强查询能力

代码片段
const { AgentExecutor, createOpenAIFunctionsAgent } =
 require("langchain/agents");

const { ChatPromptTemplate } =
 require("@langchain/core/prompts");

const { DynamicStructuredTool } =
 require("@langchain/core/tools");

const { z } =
 require("zod");

/**
 * LangChain tool that runs a Cypher query against the Neo4j knowledge graph.
 *
 * DynamicStructuredTool takes its configuration (name, description, schema,
 * func) through the constructor, so the fields are passed to `super(...)`
 * rather than declared as class fields; calling `super()` with no arguments
 * fails at instantiation.
 *
 * NOTE(review): the tool executes whatever Cypher the model produces,
 * including write queries. Consider a read-only database user.
 */
class CypherQueryTool extends DynamicStructuredTool {
  constructor() {
    super({
      name: "cypher_query",
      description:
        "Execute a Cypher query on the Neo4j knowledge graph and return the results",
      schema: z.object({
        query: z.string().describe("The Cypher query to execute"),
      }),
      // The arrow function is only invoked after construction, when `this`
      // is available.
      func: async (input) => this._func(input),
    });
  }

  /**
   * Run the query and serialize the records.
   *
   * @param {{query: string}} input - Tool input validated by the schema.
   * @returns {Promise<string>} JSON string of the records, or an error
   *   message string when the query fails (errors are reported to the agent
   *   rather than thrown).
   */
  async _func({ query }) {
    try {
      const sessionManager = new Neo4jSession();
      const result = await sessionManager.runQuery(query);

      return JSON.stringify(result.records.map((r) => r.toObject()));
    } catch (error) {
      return `Query failed with error:
              ${error.message}`;
    }
  }
}

/**
 * Build an agent executor that answers questions about the knowledge graph
 * using the Cypher query tool.
 *
 * @returns {Promise<object>} The value returned by
 *   `AgentExecutor.fromAgentAndTools`.
 * @throws Rethrows any setup error after logging it.
 */
async function setupAgent() {
  try {
    const systemPrompt = `You are an expert knowledge graph analyst.
                Use the provided tools to answer questions about the knowledge graph.
                Always verify your answers with actual queries when possible.
                If a tool fails,try again or ask for clarification.
                Respond in markdown format with clear explanations.

                Current date:
                ${new Date().toISOString()}`;

    const tools = [new CypherQueryTool()];

    const promptMessages = [
      ["system", systemPrompt],
      ["placeholder", "{chat_history}"],
      ["human", "{input}"],
      ["placeholder", "{agent_scratchpad}"],
    ];
    const prompt = ChatPromptTemplate.fromMessages(promptMessages);

    const agent = await createOpenAIFunctionsAgent({ llm, tools, prompt });

    return AgentExecutor.fromAgentAndTools({ agent, tools });
  } catch (setupError) {
    console.error("Failed to setup agent:", setupError);
    throw setupError;
  }
}

Python代码补充说明

虽然本文主要使用JavaScript,但某些NLP任务可能需要Python支持:

代码片段
# spaCy实体识别补充(可选)
import spacy

nlp_en = spacy.load("en_core_web_lg")

def extract_entities_python(text):
    """Run the module-level spaCy pipeline over ``text`` and list its entities.

    Args:
        text: Input string to analyse.

    Returns:
        A list of dicts, one per entity in ``doc.ents``, with the keys
        ``text``, ``label``, ``start`` and ``end`` (character offsets).
    """
    doc = nlp_en(text)
    # The loop body must be indented under the ``for``; the flat layout of the
    # earlier version was an IndentationError.
    return [
        {
            "text": ent.text,
            "label": ent.label_,
            "start": ent.start_char,
            "end": ent.end_char,
        }
        for ent in doc.ents
    ]

# NLTK关系提取示例(可选)
from nltk import pos_tag, ne_chunk 
from nltk.tokenize import word_tokenize 

def extract_relations_python(text):
    """Tokenize, POS-tag and named-entity-chunk ``text`` with NLTK.

    Args:
        text: Input string to analyse.

    Returns:
        Whatever ``ne_chunk`` returns for the tagged tokens. No relation
        extraction is performed yet.
    """
    # TODO: add relation-extraction logic
    return ne_chunk(pos_tag(word_tokenize(text)))

JavaScript代码优化技巧

  1. 批量操作优化
代码片段
/**
 * Create entity nodes in batches of 50, running each batch concurrently and
 * waiting for it to finish before starting the next.
 *
 * The earlier version read `this.createEntityNode` inside a free function, so
 * a plain call failed because `this` is undefined. The builder can now be
 * passed explicitly; it still defaults to `this`, so using the function as a
 * method (or via `.call(builder, entities)`) keeps working.
 *
 * @param {Iterable<object>} entities - Entities to create.
 * @param {{createEntityNode: Function}} [builder=this] - Object that creates
 *   one node per entity (e.g. a KnowledgeGraphBuilder).
 * @returns {Promise<void>}
 * @throws {TypeError} When no usable builder is available.
 */
async function batchCreateEntities(entities, builder = this) {
  if (typeof builder?.createEntityNode !== 'function') {
    throw new TypeError('batchCreateEntities requires a builder with createEntityNode()');
  }

  const batchSize = 50;
  const pending = Array.from(entities ?? []);

  for (let start = 0; start < pending.length; start += batchSize) {
    const batch = pending.slice(start, start + batchSize);
    await Promise.all(batch.map((entity) => builder.createEntityNode(entity)));
  }
}
  2. 缓存策略
代码片段
/**
 * Minimal in-memory cache keyed by entity id, backed by a Map stored on
 * `this.cache`.
 */
class EntityCache {
  constructor() {
    this.cache = new Map();
  }

  /** @returns {boolean} Whether an entry exists for `id`. */
  has(id) {
    return this.cache.has(id);
  }

  /** @returns {*} The stored value, or undefined when absent. */
  get(id) {
    return this.cache.get(id);
  }

  /** Store `data` under `id`; returns nothing. */
  set(id, data) {
    this.cache.set(id, data);
  }

  /** Remove every entry. */
  clear() {
    this.cache.clear();
  }

  /** @returns {number} Number of stored entries (a method, not a getter). */
  size() {
    return this.cache.size;
  }
}
  3. 性能监控
代码片段
/**
 * Wrap an async function so that its wall-clock duration is logged on both
 * success and failure.
 *
 * @param {Function} fn - Function to wrap; its `name` is used in the log line.
 * @returns {Function} Async wrapper that forwards `this` and all arguments to
 *   `fn`, resolves to its result, and rethrows its errors after logging.
 */
function withPerformanceLogging(fn) {
  return async function (...args) {
    const start = Date.now();
    try {
      // `apply` takes the receiver and the argument list as two arguments;
      // the earlier `fn.apply(this.args)` passed a single property lookup and
      // dropped every argument.
      const result = await fn.apply(this, args);

      console.log(`${fn.name} executed in ${
        Date.now() - start}ms`);

      return result;
    } catch (error) {
      console.error(`${fn.name} failed after ${
        Date.now() - start}ms`, error);

      throw error;
    }
  };
}

LangChain高级功能集成示例

RAG与知识图谱结合:

代码片段
const {Neo4jVectorStore}=require("@langchain/community/vectorstores/neo4j_vector"); 

/**
 * Create a Neo4j-backed vector store configured for hybrid search.
 *
 * Fixes relative to the earlier version:
 * - the Bolt URL was missing the colon before the port ("localhost7687");
 * - the OpenAI key is read from OPENAI_API_KEY instead of a hard-coded
 *   placeholder;
 * - the store is created through `Neo4jVectorStore.initialize`, which sets up
 *   the driver connection, instead of the bare constructor.
 *
 * Connection settings come from NEO_URL / NEO_USER / NEO_PWD when set, with
 * the former literals as local-development fallbacks.
 *
 * @returns {Promise<object>} The initialized vector store.
 */
async function setupVectorSearch() {
  const embeddings = new OpenAIEmbeddings({
    openAIApiKey: process.env.OPENAI_API_KEY,
  });

  const store = await Neo4jVectorStore.initialize(embeddings, {
    url: process.env.NEO_URL ?? "bolt://localhost:7687",
    username: process.env.NEO_USER ?? "neo4j",
    password: process.env.NEO_PWD ?? "password",
    indexName: "knowledge_embeddings",
    nodeLabel: "KnowledgeChunk",
    textNodeProperty: "text",
    embeddingNodeProperty: "embedding",
    searchType: "hybrid",
  });

  return store;
}

/**
 * Run a similarity search against the knowledge vector store.
 *
 * The earlier call `similaritySearch(query.k5)` passed the undefined property
 * `k5` of the query string instead of the query and a result count.
 *
 * @param {string} query - Natural-language search text.
 * @param {number} [k=5] - Maximum number of results to return.
 * @returns {Promise<Array>} Matching documents from the store.
 */
async function searchKnowledge(query, k = 5) {
  const store = await setupVectorSearch();

  return store.similaritySearch(query, k);
}

LangSmith集成:

代码片段
// LangSmith tracing defaults.
// `??=` only fills in values that are not already set, so settings supplied
// by the real environment (in particular the API key) are never overwritten
// by the placeholders below. All values are strings: assigning a non-string
// to process.env relies on implicit coercion, which Node.js deprecates.
process.env.LANGCHAIN_TRACING_V2 ??= "true";

process.env.LANGCHAIN_PROJECT ??= "KnowledgeGraphBuilder";

process.env.LANGCHAIN_ENDPOINT ??= "https://api.smith.langchain.com";

// Placeholder only; provide the real key through the environment.
process.env.LANGCHAIN_API_KEY ??= "your-api-key";

process.env.LANGCHAIN_TRACING ??= "true";

process.env.LANGCHAIN_LOG ??= "debug";

CI/CD集成建议

.github/workflows/knowledge-graph-ci.yml示例:

代码片段
name: KGCICD

on: [push, pull_request]

jobs:
  test-and-build:
    runs-on: self-hosted-neo-jammy

    steps:
      - uses: actions/checkout@v3

      - name: Set up Node.js
        uses: actions/setup-node@v3
        with:
          node-version: 20.x
          cache: 'npm'

      - name: Install dependencies
        run: npm ci
        env:
          CI: true

      - name: Run tests
        run: npm test
        env:
          NEO_URL: bolt://localhost:7687
          NEO_USER: neo4j
          NEO_PWD: ${{ secrets.NEO_PWD }}

      - name: Build and deploy
        if: github.ref == 'refs/heads/main'
        run: npm run build && npm run deploy-prod
        env:
          DEPLOY_TOKEN: ${{ secrets.DEPLOY_TOKEN }}

Docker部署配置示例

docker-compose.yml:

```yaml
version: '3'

services:
  neo4j:
    image: 'neo4j:5'
    container_name: 'neo4j-kg'
    ports:
      - '7474:7474'
      - '7687:7687'
    environment:
      NEO4J_AUTH: none
    volumes:
      - ./data:/data
      - ./logs:/logs
      - ./import:/var/lib/neo4j/import
    restart: always
    networks:
      - kg-net

  app:
    build: .
    ports:
      - '3000:3000'
    environment:
      NODE_ENV: production
      NEO_URL: bolt://neo4j:7687
    depends_on:
      - neo4j
    networks:
      - kg-net

networks:
  kg-net:
    driver: bridge
```

代码片段

## K8S部署配置示例

`kg-deployment.yaml`:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kg-app
spec:
  replicas: 3
  selector:
    matchLabels:
      app: kg-app
  template:
    metadata:
      labels:
        app: kg-app
    spec:
      containers:
        - name: kg-app
          image: your-registry/kg-app:latest
          ports:
            - containerPort: 3000
          env:
            - name: NODE_ENV
              value: production
            - name: NEO_URL
              value: bolt://neo-service:7687
---
apiVersion: v1
kind: Service
metadata:
  name: neo-service
spec:
  selector:
    app: neo-app
  ports:
    - name: http
      protocol: TCP
      port: 7474
      targetPort: 7474
    - name: bolt
      protocol: TCP
      port: 7687
      targetPort: 7687
---
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: neo-app
spec:
  serviceName: neo-service
  replicas: 1
  selector:
    matchLabels:
      app: neo-app
  template:
    # ...
```

CDK部署脚本示例(TypeScript)

lib/kg-stack.ts:

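```typescript
import * as cdk from 'aws-cdk-lib';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import * as ecs from 'aws-cdk-lib/aws-ecs';
import * as ecsPatterns from 'aws-cdk-lib/aws-ecs-patterns';
import * as rds from 'aws-cdk-lib/aws-rds';
import { Construct } from 'constructs';

export class KgStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    // Create VPC
    const vpc = new ec2.Vpc(this, 'KGVpc', { maxAzs: 3, natGateways: 1 });

    // Create NeoJ cluster
    // NOTE(review): this provisions Aurora PostgreSQL, which does not speak the
    // Bolt protocol used by NEO_URL below; confirm the intended database service.
    const neojCluster = new rds.DatabaseCluster(this, 'NeoJCluster', {
      engine: rds.DatabaseClusterEngine.auroraPostgres({
        version: rds.AuroraPostgresEngineVersion.VER_15_2,
      }),
      instanceProps: {
        vpc,
        instanceType: ec2.InstanceType.of(ec2.InstanceClass.T3, ec2.InstanceSize.MEDIUM),
      },
    });

    // Create ECS Service
    new ecsPatterns.ApplicationLoadBalancedFargateService(this, 'KGService', {
      taskImageOptions: {
        image: ecs.ContainerImage.fromAsset('./docker'),
        containerPort: 3000,
        environment: {
          NODE_ENV: 'production',
          NEO_URL: `bolt://${neojCluster.clusterEndpoint.socketAddress}`,
        },
      },
      vpc,
    });
  }
}
```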

代码片段

## React可视化组件示例

`components/KGVisualizer.jsx`:

```jsx
import React, { useState, useEffect } from 'react';
import ForceGraph3D from 'react-force-graph-3d';

export default function KGVisualizer({ initialData }) {
  const [graphData, setGraphData] = useState(initialData ?? { nodes: [], links: [] });

  useEffect(() => {
    fetch('/api/kg-data')
      .then((res) => res.json())
      .then((data) => {
        setGraphData(data);
      })
      .catch((err) => console.error(err));
  }, []);

  // Advanced graph layout config
  return (
    <div style={{ width: '100%', height: '800px' }}>
      <ForceGraph3D
        graphData={graphData}
        nodeAutoColorBy="group"
        linkDirectionalArrowLength={5}
        linkDirectionalArrowRelPos={0.5}
        linkCurvature={0.25}
      />
    </div>
  );
}
```

Next.js API路由示例

`pages/api/kg

原创 高质量