@prefix rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix sl:    <http://www.semanlink.net/2001/00/semanlink-schema#> .
@prefix skos:  <http://www.w3.org/2004/02/skos/core#> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix tag:   <http://www.semanlink.net/tag/> .
@prefix foaf:  <http://xmlns.com/foaf/0.1/> .
@prefix dc:    <http://purl.org/dc/elements/1.1/> .

<http://www.semanlink.net/doc/2019/09/machine_learning_for_humans_pa>
        dc:title         "Machine Learning for Humans, Part 5: Reinforcement Learning" ;
        sl:creationDate  "2019-09-23" ;
        sl:tag           tag:reinforcement_learning .

tag:chain_of_thought  a  sl:Tag ;
        skos:prefLabel  "Chain-of-thought" .

<http://www.semanlink.net/doc/2023/04/rl_for_llms_md>
        dc:title         "Reinforcement Learning for Language Models" ;
        sl:comment       "> I was puzzled for a while as to why we need RL for LM training, rather than just using supervised instruct tuning. I now have a convincing argument, which is also reflected in a recent talk by @johnschulman2.\r\n\r\n1st convincing argument:\r\n\r\n> supervised learning allows only positive feedback (we show the model a series of questions and their correct answers) while **RL allows also for negative feedback** (the model is allowed to generate an answer and get feedback saying \"this is not correct\")...if you as a learner are allowed to form your own hypotheses and ask the teacher if they are correct (as in the RL setting), even an adversarial teacher can no longer trick you into latching on to a wrong hypothesis.\r\n\r\n2nd convincing argument is about knowledge-seeking queries\r\n\r\n> we want to encourage the model to answer based on its internal knowledge, but we don't know what this internal knowledge contains. In supervised training, we present the model with a question and its correct answer, and train the model to replicate the provided answer... But if we succeed in training the model to generalize in [the cases it doesn't know], then we essentially teach the model to make stuff up! it actively encourages the model to \"lie\"." ;
        sl:creationDate  "2023-04-23" ;
        sl:tag           tag:yoav_goldberg , tag:reinforcement_learning_from_human_feedback , tag:reinforcement_learning , tag:chatgpt .

<http://www.semanlink.net/doc/2023/05/peter_j_liu_sur_twitter_her>
        dc:title         "Peter J. Liu sur Twitter : \"RLHF-alternative without RL\"" ;
        sl:comment       "> TL;DR: Works as well as RLHF, but a lot simpler. About as easy and efficient as fine-tuning. Much better than simply fine-tuning on good examples." ;
        sl:creationDate  "2023-05-18" ;
        sl:tag           tag:tweet , tag:reinforcement_learning_from_human_feedback , tag:reinforcement_learning , tag:nlp_google , tag:fine_tuning , tag:alternative_way .

tag:football  a         sl:Tag ;
        skos:prefLabel  "Football" .

tag:learning_by_imitation
        a               sl:Tag ;
        skos:prefLabel  "Learning by imitation" .

tag:google_deepmind  a  sl:Tag ;
        skos:prefLabel  "DeepMind" .

tag:nlp_google  a       sl:Tag ;
        skos:prefLabel  "NLP@Google" .

tag:france_is_ai_2018
        a               sl:Tag ;
        skos:prefLabel  "France is AI 2018" .

<http://www.wildml.com/2018/02/introduction-to-learning-to-trade-with-reinforcement-learning/>
        dc:title         "Introduction to Learning to Trade with Reinforcement Learning – WildML" ;
        sl:creationDate  "2018-02-11" ;
        sl:tag           tag:reinforcement_learning , tag:denny_britz , tag:bourse , tag:bitcoin .

tag:self_supervised_learning
        a               sl:Tag ;
        skos:prefLabel  "Self-Supervised Learning" .

tag:reasoning_models_math_evals
        a               sl:Tag ;
        skos:prefLabel  "Reasoning models: math evals" .

<https://eng.uber.com/deep-neuroevolution/>
        dc:title         "Welcoming the Era of Deep Neuroevolution - Uber Engineering Blog" ;
        sl:comment       "> a suite of five papers that support the emerging realization that neuroevolution, where neural networks are optimized through evolutionary algorithms, is also an effective method to train deep neural networks for reinforcement learning (RL) problems." ;
        sl:creationDate  "2017-12-19" ;
        sl:tag           tag:uber , tag:reinforcement_learning , tag:neuroevolution , tag:evolutionary_algorithm , tag:deep_learning .

<https://blog.openai.com/evolution-strategies/>
        dc:title         "Evolution Strategies as a Scalable Alternative to Reinforcement Learning" ;
        sl:creationDate  "2018-01-06" ;
        sl:tag           tag:reinforcement_learning , tag:neuroevolution .

<https://blog.insightdatascience.com/reinforcement-learning-from-scratch-819b65f074d8>
        dc:title         "Reinforcement Learning from scratch – Insight Data" ;
        sl:creationDate  "2018-06-09" ;
        sl:tag           tag:reinforcement_learning .

tag:bitcoin  a          sl:Tag ;
        skos:prefLabel  "Bitcoin" .

<https://deepmind.com/blog/learning-to-generate-images/>
        dc:title         "Learning to write programs that generate images | DeepMind" ;
        sl:comment       "This ability to interpret objects through the tools that created them gives us a richer understanding of the world and is an important aspect of our intelligence." ;
        sl:creationDate  "2018-03-28" ;
        sl:tag           tag:reinforcement_learning , tag:google_deepmind .

tag:meta_reinforcement_learning
        a               sl:Tag ;
        skos:broader    tag:reinforcement_learning ;
        skos:prefLabel  "Meta Reinforcement Learning" .

tag:deep_learning  a    sl:Tag ;
        skos:prefLabel  "Deep Learning" .

tag:fine_tuning  a      sl:Tag ;
        skos:prefLabel  "Fine-tuning" .

<https://sermanet.github.io/imitate/>
        dc:title         "Time-Contrastive Networks: Self-Supervised Learning from Video (2017)" ;
        sl:comment       "Self-supervised approach for learning representations and robotic behaviors entirely from unlabeled videos recorded from multiple viewpoints, and study how this representation can be used in two robotic imitation settings: imitating object interactions from videos of humans, and imitating human poses. \r\n\r\n> We train our representations using a metric learning loss, where multiple simultaneous viewpoints of the same observation are attracted in the embedding space, while being repelled from temporal neighbors which are often visually similar but functionally different. In other words, the model simultaneously learns to recognize what is common between different-looking images, and what is different between similar-looking images.\r\n> This signal causes our model to discover attributes that do not change across viewpoint, but do change across time, while ignoring nuisance variables such as occlusions, motion blur, lighting and background. We demonstrate that this representation can be used by a robot to directly mimic human poses without an explicit correspondence, and that it can be used as a reward function within a reinforcement learning algorithm." ;
        sl:creationDate  "2018-10-27" ;
        sl:tag           tag:self_supervised_learning , tag:robotic_imitation , tag:reinforcement_learning , tag:learning_by_imitation , tag:google_brain , tag:france_is_ai_2018 .

tag:alternative_way  a  sl:Tag ;
        skos:prefLabel  "Alternative way" .

tag:robotique  a        sl:Tag ;
        skos:prefLabel  "Robotique" .

tag:reinforcement_learning
        a                 sl:Tag ;
        rdfs:isDefinedBy  tag:reinforcement_learning.n3 ;
        sl:comment        "An area of machine learning inspired by behaviorist psychology, concerned with how software agents ought to take actions in an environment so as to maximize some notion of cumulative reward." ;
        skos:broader      tag:machine_learning_techniques ;
        skos:prefLabel    "Reinforcement learning" ;
        skos:related      tag:google_deepmind ;
        foaf:page         tag:reinforcement_learning.html .

<http://www.semanlink.net/doc/2023/04/aran_komatsuzaki_sur_twitter__1>
        dc:title         "Aran Komatsuzaki sur Twitter : \"Learning Agile Soccer Skills for a Bipedal Robot with Deep Reinforcement Learning\"" ;
        sl:creationDate  "2023-04-27" ;
        sl:tag           tag:tweet , tag:robotique , tag:reinforcement_learning , tag:football .

tag:google_brain  a     sl:Tag ;
        skos:prefLabel  "Google Brain" .

tag:developmental_reinforcement_learning
        a               sl:Tag ;
        skos:broader    tag:reinforcement_learning ;
        skos:prefLabel  "Developmental reinforcement learning" .

tag:evolutionary_algorithm
        a               sl:Tag ;
        skos:prefLabel  "Evolutionary algorithm" .

tag:deep_learning_implementing
        a               sl:Tag ;
        skos:prefLabel  "Deep learning: implementing" .

<http://amid.fish/reproducing-deep-rl>
        dc:title         "Lessons Learned Reproducing a Deep Reinforcement Learning Paper" ;
        sl:creationDate  "2018-04-10" ;
        sl:tag           tag:reinforcement_learning , tag:deep_learning_implementing .

<http://www.semanlink.net/doc/2022/10/prithviraj_raj_ammanabrolu_su>
        dc:title         "Prithviraj (Raj) Ammanabrolu sur Twitter : \"The secret to aligning LMs to human preferences is reinforcement learning. ...\"" ;
        sl:creationDate  "2022-10-06" ;
        sl:tag           tag:tweet , tag:reinforcement_learning , tag:language_model .

tag:neuroevolution  a   sl:Tag ;
        skos:prefLabel  "Neuroevolution" .

tag:bourse  a           sl:Tag ;
        skos:prefLabel  "Bourse" .

tag:deepseek  a         sl:Tag ;
        skos:prefLabel  "DeepSeek" .

<http://www.semanlink.net/doc/2025/02/diffuse_one>
        dc:title         "diffuse.one/reasoning_update_0" ;
        sl:comment       "> There is an emerging pattern of fine-tuning a small language model followed by reinforcement learning.\r\n\r\n> A reasoning model is a large language model that is trained to output both a chain of thought and a response. The chain of thought should be relatively long (> 1,000 tokens) and the reasoning should improve its performance relative to a similar-sized non-reasoning models. This is sometimes called \"test-time\" or \"inference-time\" scaling because reasoning models emit more tokens per completion and gain some performance as a result." ;
        sl:creationDate  "2025-02-24" ;
        sl:tag           tag:reinforcement_learning , tag:reasoning_models , tag:reasoning_models_math_evals , tag:knowledge_distillation , tag:deepseek_r1 , tag:chain_of_thought .

<http://www.semanlink.net/doc/2025/02/deepseek_r1_model_by_deepseek_a>
        dc:title         "deepseek-r1 Model by Deepseek-ai | NVIDIA NIM" ;
        sl:comment       "> DeepSeek-R1 is a first-generation **reasoning model trained using large-scale reinforcement learning** (RL) to solve complex reasoning tasks across domains such as math, code, and language. The model leverages RL to develop reasoning capabilities, which are further enhanced through supervised fine-tuning (SFT) to improve readability and coherence." ;
        sl:creationDate  "2025-02-24" ;
        sl:tag           tag:reinforcement_learning , tag:reasoning_models , tag:deepseek_r1 , tag:deepseek .

tag:machine_learning_techniques
        a               sl:Tag ;
        skos:prefLabel  "Machine learning: techniques" .

tag:deepseek_r1  a      sl:Tag ;
        skos:broader    tag:reinforcement_learning ;
        skos:prefLabel  "Deepseek-r1" .

tag:chatgpt  a          sl:Tag ;
        skos:prefLabel  "ChatGPT" .

tag:yoav_goldberg  a    sl:Tag ;
        skos:prefLabel  "Yoav Goldberg" .

tag:robotic_imitation
        a               sl:Tag ;
        skos:prefLabel  "Robotic imitation" .

tag:uber  a             sl:Tag ;
        skos:prefLabel  "Uber" .

tag:knowledge_distillation
        a               sl:Tag ;
        skos:prefLabel  "Knowledge distillation" .

tag:alphago  a          sl:Tag ;
        skos:prefLabel  "Alphago" .

<https://spinningup.openai.com/en/latest/spinningup/keypapers.html>
        dc:title         "Key Papers in Deep RL — OpenAI - Spinning Up documentation" ;
        sl:creationDate  "2018-11-09" ;
        sl:tag           tag:reinforcement_learning .

<https://deepmind.com/blog/alphago-zero-learning-scratch/>
        dc:title         "AlphaGo Zero: Learning from scratch | DeepMind" ;
        sl:creationDate  "2017-10-18" ;
        sl:tag           tag:reinforcement_learning , tag:alphago .

tag:language_model  a   sl:Tag ;
        skos:prefLabel  "Language Model" .

tag:tweet  a            sl:Tag ;
        skos:prefLabel  "Tweet" .

tag:reasoning_models  a  sl:Tag ;
        skos:prefLabel  "Reasoning models (Inference-time scaling)" .

tag:denny_britz  a      sl:Tag ;
        skos:prefLabel  "Denny Britz" .

tag:reinforcement_learning_from_human_feedback
        a               sl:Tag ;
        skos:broader    tag:reinforcement_learning ;
        skos:prefLabel  "RL from Human Feedback" .
