<?xml version='1.0' encoding='UTF-8'  ?><rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/">	<channel rdf:about="http://www.semanlink.net/tag/vision_language_models">		<title>Vision Language Models</title>		<link>http://www.semanlink.net/tag/vision_language_models</link>		<description>Documents tagged with Vision Language Models</description>		<items>			<rdf:Seq>							<rdf:li resource="http://www.semanlink.net/doc/2025/10/lightonocr_1b_the_case_for_end"/>				<rdf:li resource="http://www.semanlink.net/doc/2025/08/en_python_comment_creer_une_im"/>				<rdf:li resource="http://www.semanlink.net/doc/2025/07/manuel_faysse_sur_x_introduc"/>				<rdf:li resource="http://www.semanlink.net/doc/2025/01/llamaindex_%F0%9F%A6%99_sur_x_we_ve_re"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/12/tony_wu_sur_x_colpali_is_now"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/11/beyond_text_the_rise_of_vision"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/10/tonywu71_colpali_cookbooks_rec"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/10/%F0%9F%8D%84_colpali_document_retrieval_"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/10/so_yeon_tiffany_min_sur_x_"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/09/2406_11251_unifying_multimoda"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/09/colpali_revolutionizing_multi"/>				<rdf:li resource="http://www.semanlink.net/doc/2024/09/2407_01449_colpali_efficient"/>			</rdf:Seq>		</items>	</channel>		<item rdf:about="http://www.semanlink.net/doc/2025/10/lightonocr_1b_the_case_for_end">		<title>LightOnOCR-1B: The Case for End-to-End and Efficient Domain-Specific Vision-Language Models for OCR</title>		<link>http://www.semanlink.net/doc/2025/10/lightonocr_1b_the_case_for_end</link>		<dc:date>2025-10-23T18:32:40Z</dc:date>	</item>	<item 
rdf:about="http://www.semanlink.net/doc/2025/08/en_python_comment_creer_une_im">		<title>en python, comment créer une image à partir d&apos;une page de site web</title>		<link>http://www.semanlink.net/doc/2025/08/en_python_comment_creer_une_im</link>		<dc:date>2025-08-27T14:27:03Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2025/07/manuel_faysse_sur_x_introduc">		<title>Manuel Faysse sur X : &quot;Introducing ColQwen-Omni, a 3B omnimodal retriever that extends the ColPali concept of multimodal retrieval with late interaction to audio chunks and short videos...&quot;</title>		<link>http://www.semanlink.net/doc/2025/07/manuel_faysse_sur_x_introduc</link>		<description>&gt; The model is trained solely on visual document retrieval. Based on the Qwen-Omni backbone, it is able to generalize its newly learned document embedding capabilities to embed audios and short videos without ever having seen those during contrastive training!

&gt; We can thus do &quot;**Retrieval in Audio Space**&quot;. It only takes 10s to embed 30 minutes of audio.
&gt; This is orders of magnitude faster than indexing based on STT transcriptions		</description>		<dc:date>2025-07-17T21:35:10Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2025/01/llamaindex_%F0%9F%A6%99_sur_x_we_ve_re">		<title>LlamaIndex 🦙 sur X : &quot;a new multilingual, open-source visual embedding model and training set on Huggingface...&quot;</title>		<link>http://www.semanlink.net/doc/2025/01/llamaindex_%F0%9F%A6%99_sur_x_we_ve_re</link>		<dc:date>2025-01-11T14:40:36Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/12/tony_wu_sur_x_colpali_is_now">		<title>Tony Wu sur X : &quot;ColPali is now live in 🤗 transformers!&quot;</title>		<link>http://www.semanlink.net/doc/2024/12/tony_wu_sur_x_colpali_is_now</link>		<dc:date>2024-12-18T18:06:19Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/11/beyond_text_the_rise_of_vision">		<title>Beyond Text: The Rise of Vision-Driven Document Retrieval for RAG | Vespa Blog</title>		<link>http://www.semanlink.net/doc/2024/11/beyond_text_the_rise_of_vision</link>		<dc:date>2024-11-10T10:15:36Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/10/tonywu71_colpali_cookbooks_rec">		<title>tonywu71/colpali-cookbooks: Recipes for learning, fine-tuning, and adapting ColPali to your multimodal RAG use cases. 
👨🏻‍🍳</title>		<link>http://www.semanlink.net/doc/2024/10/tonywu71_colpali_cookbooks_rec</link>		<dc:date>2024-10-19T07:26:16Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/10/%F0%9F%8D%84_colpali_document_retrieval_">		<title>🍄 ColPali: Document Retrieval with Vision Language Models</title>		<link>http://www.semanlink.net/doc/2024/10/%F0%9F%8D%84_colpali_document_retrieval_</link>		<dc:date>2024-10-06T23:19:52Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/10/so_yeon_tiffany_min_sur_x_">		<title>So Yeon (Tiffany) Min sur X : &quot;Embodied-RAG, a General Non-Parametric Method for Retrieval and Generation...&quot;</title>		<link>http://www.semanlink.net/doc/2024/10/so_yeon_tiffany_min_sur_x_</link>		<description>&gt; A new framework that equips embodied agents with a non-parametric memory capable of autonomously constructing hierarchical knowledge for navigation and language generation.
([Ruslan Salakhutdinov&#93;(tag:ruslan_salakhutdinov) [tweet&#93;(https://x.com/rsalakhu/status/1842694504387916073))

&gt; Hi robot, I&apos;m dehydrated, can you take me somewhere?

&gt; How to apply non-parametric memory to every day experiences?

&gt; key challenges in building embodied memory
&gt; - Dense memory that logs everything is memory inefficient.
&gt; - Space is continuous, and locations are spatially correlated, in contrast to independent documents in the text domain.

&gt; During the retrieval/generation phase, we select K &quot;chains&quot; (a leaf node all the way up to the root node), that are closest to the query.		</description>		<dc:date>2024-10-06T10:18:51Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/09/2406_11251_unifying_multimoda">		<title>[2406.11251&#93; Unifying Multimodal Retrieval via Document Screenshot Embedding</title>		<link>http://www.semanlink.net/doc/2024/09/2406_11251_unifying_multimoda</link>		<description>&gt; Document Screenshot Embedding (DSE), a novel retrieval paradigm that regards document screenshots as a unified input format, which does not require any content extraction preprocess and preserves all the information in a document (e.g., text, image and layout)		</description>		<dc:date>2024-09-25T15:58:45Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/09/colpali_revolutionizing_multi">		<title>ColPali — Revolutionizing multimodal document retrieval | by Simeon Emanuilov | Sep, 2024 | Medium</title>		<link>http://www.semanlink.net/doc/2024/09/colpali_revolutionizing_multi</link>		<dc:date>2024-09-20T23:16:53Z</dc:date>	</item>	<item rdf:about="http://www.semanlink.net/doc/2024/09/2407_01449_colpali_efficient">		<title>[2407.01449&#93; ColPali: Efficient Document Retrieval with Vision Language Models</title>		<link>http://www.semanlink.net/doc/2024/09/2407_01449_colpali_efficient</link>		<dc:date>2024-09-07T13:56:46Z</dc:date>	</item></rdf:RDF>