from haystack_brightdata import BrightDataWebScraper

# List every dataset the scraper reports as supported, then show only the
# ones whose id mentions a sales-research keyword.
datasets = BrightDataWebScraper.get_supported_datasets()
print(f"Total available datasets: {len(datasets)}\n")

print("Sales research relevant datasets:")
print("-" * 50)

# Substrings matched case-insensitively against each dataset id.
relevant_keywords = ["linkedin", "crunchbase", "company", "profile"]

for dataset in datasets:
    lowered_id = dataset['id'].lower()
    is_relevant = any(keyword in lowered_id for keyword in relevant_keywords)
    if not is_relevant:
        continue
    print(f" {dataset['id']}")
    print(f" {dataset['description']}\n")
import json

company_url = "https://www.crunchbase.com/organization/openai"


def coalesce(data, *keys, default="N/A"):
    """Return the first non-empty value stored in ``data`` under ``keys``.

    Keys are tried in the order given. A value counts as empty when it is
    ``None``, ``""``, ``[]`` or ``{}``.

    Args:
        data: Mapping to read from (anything exposing ``.get``).
        *keys: Candidate keys, highest priority first.
        default: Value returned when every candidate is missing or empty.

    Returns:
        The first non-empty value, or ``default``.
    """
    for key in keys:
        value = data.get(key)
        if value not in (None, "", [], {}):
            return value
    return default


def format_industries(industries):
    """Render an ``industries`` field as a comma-separated string.

    Args:
        industries: Either a list whose items are dicts (read via their
            ``value``, ``name`` or ``id`` key, in that order) or plain
            values, or any other object.

    Returns:
        ``"N/A"`` when ``industries`` is falsy or yields no usable entries,
        a ``", "``-joined string for list input, and the input unchanged for
        any other truthy non-list value.
    """
    if not industries:
        return "N/A"
    if isinstance(industries, list):
        values = []
        for item in industries:
            if isinstance(item, dict):
                value = item.get("value") or item.get("name") or item.get("id")
                if value:
                    # str() guards the join below: a non-string entry (for
                    # example a numeric id) would otherwise raise TypeError.
                    values.append(str(value))
            else:
                values.append(str(item))
        return ", ".join(values) if values else "N/A"
    return industries


def parse_company(result):
    """Extract a single company record (a dict) from a scraper result.

    The payload is ``result["data"]`` when that key exists, otherwise
    ``result`` itself. A string payload is decoded as JSON first.

    Args:
        result: Mapping returned by the scraper run.

    Returns:
        The payload when it is a dict, its first element when it is a list
        whose first element is a dict, and ``{}`` in every other case.

    Raises:
        json.JSONDecodeError: If the payload is a string that is not valid
            JSON.
    """
    raw = result.get("data", result)
    if isinstance(raw, str):
        raw = json.loads(raw)
    if isinstance(raw, list):
        first = raw[0] if raw else {}
        # Callers use .get() on the result, so never hand back a non-dict.
        return first if isinstance(first, dict) else {}
    if isinstance(raw, dict):
        return raw
    return {}


# NOTE(review): `scraper` is not defined in this snippet; presumably it is
# created in an earlier step of the tutorial. Confirm before running.
result = scraper.run(
    dataset="crunchbase_company",
    url=company_url,
)

company_data = parse_company(result)
industries = format_industries(company_data.get("industries"))

print(f"Company: {coalesce(company_data, 'name', 'legal_name')}")
print(f"Overview: {coalesce(company_data, 'about', 'company_overview')}")
print(f"Industries: {industries}")
print(f"Operating Status: {coalesce(company_data, 'operating_status')}")
print(f"Website: {coalesce(company_data, 'website', 'url')}")
print(f"Employees: {coalesce(company_data, 'num_employees', 'number_of_employee_profiles')}")
预期输出:
Company: OpenAI
Overview: OpenAI is an AI research and deployment company that develops advanced AI models, including ChatGPT.
Industries: Agentic AI, Artificial Intelligence (AI), Generative AI, Machine Learning, SaaS
Operating Status: active
Website: https://www.openai.com
Employees: 1001-5000
Name: Satya Nadella
Position: Chairman and CEO at Microsoft
Location: Redmond, Washington, United States, US
Current Company: Microsoft
Followers: 11816477
Connections: 500
Experience (5 roles):
  1. Chairman and CEO at Microsoft (N/A)
  2. Member Board Of Trustees at University of Chicago (N/A)
  3. Board Member at Starbucks (N/A)
from haystack import Pipeline
from haystack.components.builders import ChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.embedders.google_genai import GoogleGenAITextEmbedder
from haystack_integrations.components.generators.google_genai import GoogleGenAIChatGenerator

# System prompt for the generator. The line breaks keep the instruction list
# readable; without them the sentences and bullets run together.
system_message = ChatMessage.from_system(
    """You are a sales intelligence assistant. Your role is to analyze company and people data to provide actionable sales intelligence.

When answering queries:
- Cite specific company names and details from the data
- Provide insights relevant for sales outreach
- Highlight key information like funding, company size, location, recent news
- Suggest talking points for personalized outreach"""
)

# User prompt template: loops over `documents`, emitting each document's
# content followed by a `---` separator, then interpolates `question`.
user_template = """Based on the following company/person data, answer the user's question.

Context:
{% for document in documents %}
{{ document.content }}
---
{% endfor %}

Question: {{ question }}

Provide a detailed, actionable answer based on the retrieved data."""

user_message = ChatMessage.from_user(user_template)

# NOTE(review): MongoDBAtlasEmbeddingRetriever and document_store are not
# defined in this snippet; presumably they are imported / created in an
# earlier step of the tutorial. Confirm before running.
rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", GoogleGenAITextEmbedder(model="text-embedding-004"))
rag_pipeline.add_component(
    "retriever",
    MongoDBAtlasEmbeddingRetriever(document_store=document_store, top_k=5),
)
rag_pipeline.add_component(
    "prompt_builder",
    ChatPromptBuilder(template=[system_message, user_message]),
)
rag_pipeline.add_component("generator", GoogleGenAIChatGenerator(model="gemini-2.5-flash"))

# Wire the stages: embedding -> retriever -> prompt builder -> generator.
rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder.prompt", "generator.messages")

print("RAG pipeline created")
print(" Question → Text Embedder → Retriever → Prompt Builder → Generator → Answer")
question = (
    "What can you tell me about OpenAI? Include details about their industry, "
    "products, and any relevant information for sales outreach."
)

# The same question feeds both the embedder (for retrieval) and the prompt
# builder (for the final prompt). The retriever's output is requested as well
# so the retrieved documents can be displayed below.
pipeline_inputs = {
    "text_embedder": {"text": question},
    "prompt_builder": {"question": question},
}
result = rag_pipeline.run(
    data=pipeline_inputs,
    include_outputs_from={"retriever"},
)

answer = result["generator"]["replies"][0].text
print(answer)

# Show the retrieved documents
if "retriever" in result:
    retrieved_docs = result["retriever"]["documents"]
    print(f"\nRetrieved {len(retrieved_docs)} relevant documents from MongoDB")

    # (label, metadata key) pairs printed for every document, in this order.
    meta_fields = (
        ("Company", "company_name"),
        ("Source", "dataset_type"),
        ("Location", "location"),
        ("Industry", "industry"),
    )
    for position, document in enumerate(retrieved_docs, 1):
        print(f"\nDocument {position}:")
        for label, meta_key in meta_fields:
            print(f" {label}: {document.meta.get(meta_key, 'N/A')}")
        # Only the first 300 characters of the content are shown.
        print(f" Content: {document.content[:300]}...")