instructor
152 строки · 5.2 Кб
1from openai import OpenAI2from io import StringIO3from typing import Annotated, Any4from pydantic import (5BaseModel,6BeforeValidator,7PlainSerializer,8InstanceOf,9WithJsonSchema,10)
11import instructor12import pandas as pd13from rich.console import Console14
# Rich console used to print the extracted captions and tables.
console = Console()

# OpenAI client wrapped by instructor in TOOLS mode, so that
# `client.chat.completions.create(...)` accepts a `response_model`.
client = instructor.from_openai(client=OpenAI(), mode=instructor.Mode.TOOLS)
20
21
def _strip_if_str(value: Any) -> Any:
    """Return *value* without surrounding whitespace when it is a string."""
    return value.strip() if isinstance(value, str) else value


def md_to_df(data: Any) -> Any:
    """Convert a pipe-delimited markdown table into a ``pandas.DataFrame``.

    Non-string input is returned unchanged, so an existing DataFrame passes
    straight through.

    For string input the text is parsed as ``|``-separated CSV, the column at
    position 1 becomes the index, columns that are entirely empty are dropped
    (typically the ones created by the leading and trailing pipes), and the
    first row after the header is discarded (presumably the ``|---|---|``
    separator row of a markdown table). Finally, surrounding whitespace is
    removed from every string cell, from the column labels and from the index
    labels and index name.

    Args:
        data: Markdown table text, or any other object.

    Returns:
        The parsed DataFrame for string input, otherwise ``data`` itself.
    """
    if not isinstance(data, str):
        return data

    frame = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            index_col=1,
        )
        .dropna(axis=1, how="all")
        .iloc[1:]
    )

    # Get rid of whitespaces. Only strings are stripped: an empty cell is read
    # as NaN (a float), and calling .strip() on it would raise AttributeError.
    # DataFrame.map only exists on pandas >= 2.1; older releases call it applymap.
    cell_map = frame.map if hasattr(pd.DataFrame, "map") else frame.applymap
    frame = cell_map(_strip_if_str)

    # The header and index cells carry the same padding as the body cells.
    frame = frame.rename(index=_strip_if_str, columns=_strip_if_str)
    frame.index.name = _strip_if_str(frame.index.name)
    return frame
36
# JSON schema advertised for the field: a plain string holding a markdown table.
# The description text is kept verbatim because it is part of the emitted schema.
_MARKDOWN_TABLE_SCHEMA = {
    "type": "string",
    "description": """
The markdown representation of the table,
each one should be tidy, do not try to join tables
that should be seperate""",
}

# A DataFrame-typed field that is exchanged as markdown text:
#   * md_to_df is applied to the incoming value before validation,
#   * the value must then be a pandas DataFrame instance,
#   * on serialization the frame is rendered with DataFrame.to_markdown().
MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(lambda frame: frame.to_markdown()),
    WithJsonSchema(_MARKDOWN_TABLE_SCHEMA),
]
51
52
# One extracted table: a caption plus the table contents.
# Documented with comments rather than a docstring on purpose: pydantic copies
# a model's docstring into its JSON schema description, which would change the
# schema this model produces.
class Table(BaseModel):
    caption: str  # caption text for the table
    dataframe: MarkdownDataFrame  # table contents, held as a pandas DataFrame
57
# Container for every table extracted from one input.
# Documented with comments rather than a docstring on purpose: pydantic copies
# a model's docstring into its JSON schema description, which would change the
# schema this model produces.
class MultipleTables(BaseModel):
    tables: list[Table]  # the extracted tables
61
# Sample payload: a single captioned table with three columns and two rows.
_example_frame = pd.DataFrame(
    {
        "Chart A": [10, 40],
        "Chart B": [20, 50],
        "Chart C": [30, 60],
    }
)
example = MultipleTables(
    tables=[Table(caption="This is a caption", dataframe=_example_frame)]
)
76
77
# Instruction text sent together with the image; wording kept verbatim.
_EXTRACTION_PROMPT = """
First, analyze the image to determine the most appropriate headers for the tables.
Generate a descriptive h1 for the overall image, followed by a brief summary of the data it contains.
For each identified table, create an informative h2 title and a concise description of its contents.
Finally, output the markdown representation of each table.


Make sure to escape the markdown table properly, and make sure to include the caption and the dataframe.
including escaping all the newlines and quotes. Only return a markdown table in dataframe, nothing else.
"""


def extract(
    url: str,
    *,
    model: str = "gpt-4-turbo",
    max_tokens: int = 4000,
) -> MultipleTables:
    """Ask the model to extract every table shown in the image at *url*.

    A single user message is sent containing the image reference followed by
    the extraction instructions, with ``MultipleTables`` as the response
    model.

    Args:
        url: Location of the image, passed through as the ``image_url`` part.
        model: Model name forwarded to the completion call. Defaults to the
            previously hard-coded ``"gpt-4-turbo"``.
        max_tokens: Token limit forwarded to the completion call. Defaults to
            the previously hard-coded ``4000``.

    Returns:
        Whatever the patched client returns for
        ``response_model=MultipleTables``.
    """
    image_part = {"type": "image_url", "image_url": {"url": url}}
    text_part = {"type": "text", "text": _EXTRACTION_PROMPT}
    return client.chat.completions.create(
        model=model,
        max_tokens=max_tokens,
        response_model=MultipleTables,
        messages=[{"role": "user", "content": [image_part, text_part]}],
    )
109
# Images to run the extraction on.
urls = [
    "https://a.storyblok.com/f/47007/2400x1260/f816b031cb/uk-ireland-in-three-charts_chart_a.png/m/2880x0",
    "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png/m/2880x0",
]

# Extract the tables from each image and print every caption with its DataFrame.
for url in urls:
    extracted = extract(url)
    for table in extracted.tables:
        console.print(table.caption, "\n", table.dataframe)

# The bare string below is never used by the program; it appears to record
# sample output of the loop above (column alignment was lost in the source).
"""
Growth in app installations and sessions across different app categories in Q3 2022 compared to Q2 2022 for Ireland and U.K.
Install Growth (%) Session Growth (%)
Category
Education 7 6
Games 13 3
Social 4 -3
Utilities 6 -0.4
Top 10 Grossing Android Apps in Ireland, October 2023
App Name Category
Rank
1 Google One Productivity
2 Disney+ Entertainment
3 TikTok - Videos, Music & LIVE Entertainment
4 Candy Crush Saga Games
5 Tinder: Dating, Chat & Friends Social networking
6 Coin Master Games
7 Roblox Games
8 Bumble - Dating & Make Friends Dating
9 Royal Match Games
10 Spotify: Music and Podcasts Music & Audio
Top 10 Grossing iOS Apps in Ireland, October 2023
App Name Category
Rank
1 Tinder: Dating, Chat & Friends Social networking
2 Disney+ Entertainment
3 YouTube: Watch, Listen, Stream Entertainment
4 Audible: Audio Entertainment Entertainment
5 Candy Crush Saga Games
6 TikTok - Videos, Music & LIVE Entertainment
7 Bumble - Dating & Make Friends Dating
8 Roblox Games
9 LinkedIn: Job Search & News Business
10 Duolingo - Language Lessons Education
"""
153