<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e86545</article-id><article-id pub-id-type="doi">10.2196/86545</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Understanding User Intent in Code-Mixed Sexual and Reproductive Health Queries in Urban India: Hierarchical Classification Approach Using Large Language Models</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Dey</surname><given-names>Sumon Kanti</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>S</surname><given-names>Manvi</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Thapa</surname><given-names>Aradhana</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Shah</surname><given-names>Meet</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mehta</surname><given-names>Zeel</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kapile</surname><given-names>Shraddha Kale</given-names></name><degrees>MSW</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Divate</surname><given-names>Tanvi</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jalota</surname><given-names>Suhani</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ismail</surname><given-names>Azra</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical Informatics, School of Medicine, Emory University</institution><addr-line>101 Woodruff Circle</addr-line><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff2"><institution>Hubert Department of Global Health, Rollins School of Public Health, Emory University</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><aff id="aff3"><institution>Myna Mahila Foundation</institution><addr-line>Mumbai</addr-line><country>India</country></aff><aff id="aff4"><institution>Hoover Institution, Stanford University</institution><addr-line>Stanford</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name 
name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Singh</surname><given-names>Anil Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Brito</surname><given-names>Felipe T</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Borovic</surname><given-names>Mladen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sumon Kanti Dey, BSc, Department of Biomedical Informatics, School of Medicine, Emory University, 101 Woodruff Circle, Atlanta, GA, 30322, United States, 1 4046437205; <email>sumon.kanti.dey@emory.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>24</day><month>3</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e86545</elocation-id><history><date date-type="received"><day>30</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>18</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Sumon Kanti Dey, Manvi S, Aradhana Thapa, Meet Shah, Zeel Mehta, Shraddha Kale Kapile, Tanvi Divate, Suhani Jalota, Azra Ismail. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 24.3.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e86545"/><abstract><sec><title>Background</title><p>Sexual and reproductive health (SRH) remains a stigmatized and taboo topic globally, limiting access to reliable information. These challenges are heightened in the Global South, where linguistic and cultural diversity further complicates information access. In India (the study context), many individuals express SRH concerns in code-mixed language, such as Hinglish (code-mixed Hindi and English), and use colloquial terms. Large language models (LLMs) could help answer SRH questions, but most are trained for English and may perform poorly on code-mixed text and miss cultural nuances. Our research aims to address this gap by assessing the current state of LLMs in understanding user intent in SRH queries for a low-resource language.</p></sec><sec><title>Objective</title><p>We evaluate the effectiveness of proprietary, multilingual open-weight, and Indic LLMs in zero-shot settings for identifying user intent in code-mixed Hinglish SRH queries. 
Our goal is to assess how well LLMs assign correct labels in a 2-level hierarchical classification (topic and subtopic). We take a hierarchical approach because SRH queries are complex and context-dependent; flat labels may obscure clinically important distinctions and lead to misdirected guidance. We also characterize common error types driving misclassification.</p></sec><sec sec-type="methods"><title>Methods</title><p>We analyzed 4161 deidentified questions about SRH in Hinglish, collected by our partner nonprofit organization (Myna Mahila Foundation) in an underserved community in urban Mumbai. Queries were annotated into 8 topics and 40 subtopics using a hierarchical framework that captured linguistic, cultural, and contextual variation. We evaluated proprietary, multilingual open-weight, and Indic-specific LLMs in zero-shot settings. Performance was measured using hierarchical <italic>F</italic><sub>1</sub> (h<italic>F</italic><sub>1</sub>), Exact Match, and topic- and subtopic-level accuracy.</p></sec><sec sec-type="results"><title>Results</title><p>Proprietary models achieved the strongest results, with GPT-5 performing best overall (h<italic>F</italic><sub>1</sub>=0.784). Among open-weight systems, Sarvam-M emerged as the top-performing Indic model (h<italic>F</italic><sub>1</sub>=0.757), ranking just below the top-performing proprietary model and performing comparably with Claude-3.5-Sonnet (0.745; Anthropic) as well as large multilingual systems such as Llama-3.3-70B-Instruct (0.742; Meta) and Gemma-3-27B-IT (0.739; Google). Other Indic models performed considerably lower (eg, Llama-3-Gaja-Hindi-8B [0.596; CognitiveLab], Krutrim-2-Instruct [0.558; OLA Krutrim Team], and Airavata [0.404; AI4Bharat]). Smaller multilingual open-weight models, including Mixtral-8x7B-Instruct (0.593), Llama-3.1-8B-Instruct (0.630), and Gemma-2-9B-IT (0.657), consistently outperformed them, showing that parameter size alone does not explain performance gaps. 
While models generally captured broad topical intent, they frequently failed at fine-grained intent recognition, especially with euphemisms, colloquial expressions, and locally or culturally situated questions.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Hierarchical classification revealed persistent gaps in how LLMs handle code-mixed queries. Proprietary models performed best, but Sarvam-M shows that open-weight Indic systems can achieve performance near state-of-the-art models when supported by robust training data, cultural adaptation, and appropriate scale. These findings highlight the potential of localized, culturally aligned models to advance linguistically inclusive artificial intelligence tools and expand equitable access to SRH information in underserved populations globally.</p></sec></abstract><kwd-group><kwd>sexual and reproductive health</kwd><kwd>large language models</kwd><kwd>code-mixing</kwd><kwd>Hinglish</kwd><kwd>hierarchical classification</kwd><kwd>conversational agents</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Recent advancements in large language models (LLMs) present an opportunity to address significant gaps in health care information delivery. LLMs could be leveraged to simplify complex medical information, respond to patient queries, and enhance health literacy among the general population [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Despite these advancements, there remain significant disparities in the performance of these models across languages, especially in health care tasks [<xref ref-type="bibr" rid="ref3">3</xref>]. Most LLMs are predominantly centered on the English language [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. 
They can fail to recognize local dialects, cultural nuances, and speaking patterns, especially for non-English speaking populations that are less represented online. Health communication is further shaped by social dynamics, including gender, educational status, functional literacy, and cultural context [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Our research focuses on addressing these challenges in the context of India by evaluating the performance of LLMs in detecting user intent in the context of sexual and reproductive health (SRH).</p><p>SRH presents unique challenges for health information delivery, given that stigma, misinformation, and social barriers can restrict individuals&#x2014;especially women&#x2014;from seeking reliable health information and medical support [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. For instance, 78% of the 15 million abortions in India take place outside medical facilities, highlighting the need for better access to reproductive health services and information [<xref ref-type="bibr" rid="ref11">11</xref>]. Deep-rooted societal taboos also prevent open discussions about sex-related topics, exacerbating barriers to SRH awareness and services [<xref ref-type="bibr" rid="ref12">12</xref>]. At the same time, SRH is a time-sensitive domain in which delays can have serious consequences. In India, lack of early pregnancy care leads to undetected and unmanaged conditions such as anemia, diabetes, hypertension, and infections, which are significant causes of pregnancy loss and maternal mortality [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>While a multitude of online platforms exist where individuals can engage with health care professionals or even community members (eg, Reddit; Reddit, Inc), the reach of these platforms remains low in underserved communities in India. 
The absence of such inclusive platforms for linguistically diverse, resource-constrained communities further contributes to health inequities, leaving many women without access to critical SRH-related knowledge and services. Prior work by Wang et al [<xref ref-type="bibr" rid="ref15">15</xref>] has shown the potential of a rule-based conversational chatbot for SRH support among young people in India (SnehAI), demonstrating strong user engagement and information-seeking behavior. We build on this work by evaluating the potential of LLMs to support the understanding of user intent in code-mixed SRH queries for such interventions.</p><p>Adding to these challenges is the extensive linguistic diversity within India, which significantly influences communication patterns, particularly in informal and colloquial settings. A notable linguistic phenomenon prevalent across India is code-mixing, the practice of blending multiple languages within a single conversation or utterance. We focus on Hinglish&#x2014;a popular form of code-mixing involving Hindi and English, where individuals use the phonetic Latin script instead of the Devanagari script to write Hindi words [<xref ref-type="bibr" rid="ref16">16</xref>]. This was often the preferred mode of typing for the population on which we focus in this study and has been documented in prior research in other Hindi-speaking populations in India [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Code-mixing remains a long-standing challenge in natural language understanding research, with several publicly available LLMs still struggling to interpret and generate code-switched text [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>].</p><p>Another significant challenge is the inherently layered and context-dependent nature of questions that individuals frequently ask about SRH. 
Broader concerns related to pregnancy may branch into distinct subtopics such as antepartum emergency, postpartum pain, infertility, or abortion, reflecting the complex structure of real-world health inquiries. Traditional flat classification approaches may collapse these distinctions [<xref ref-type="bibr" rid="ref21">21</xref>], leading to the generation of misleading responses that may fail to capture critical aspects of care and support. Our research aims to address this gap by taking a hierarchical classification approach to understanding user intent in SRH queries (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p><bold>Hierarchical sexual and reproductive health (SRH) intent classification framework.</bold> This figure illustrates our approach to processing a code-mixed SRH user query (an English translation of the original Hinglish text is shown for clarity) using a large language model (LLM) to infer intent. The model maps the query to both a topic and its corresponding subtopic, producing structured intent classification. JSON: JavaScript Object Notation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86545_fig01.png"/></fig><p>In summary, this study contributes the following: (1) we designed a hierarchical intent structure for SRH queries with 2 levels (topic and subtopic) to capture user intent in code-mixed Hinglish, providing clear label descriptions to support consistent annotation and evaluation, (2) we evaluated several state-of-the-art proprietary and open-weight multilingual LLMs (including Indic LLMs) to measure how effectively they handle hierarchical intent classification of SRH-related queries expressed in Hinglish. 
This evaluation also provides insight into how well LLMs can handle code-mixed text, ensuring broader applicability in real-world health care contexts, and (3) we conducted a qualitative error analysis with thematic coding across models to examine misclassifications in code-mixed SRH queries. We explain why these errors occur and how models misread culturally sensitive terms and euphemisms. Representative annotated examples from the dataset are provided in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Sample annotated user queries from the dataset. The examples below present both transliteration and code-mixing between Hindi and English.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Query</td><td align="left" valign="bottom">English translation</td><td align="left" valign="bottom">Topic</td><td align="left" valign="bottom">Subtopic</td></tr></thead><tbody><tr><td align="left" valign="top">Periods agar monthly na aaye to kya karen?</td><td align="left" valign="top">What should I do if my periods are not regular every month?</td><td align="left" valign="top">Menstrual Health</td><td align="left" valign="top">Menstrual Cycle Information</td></tr><tr><td align="left" valign="top">Gabhnirodhak goliyan lene se pehle mujhe doctor se kya puchna chahiye?</td><td align="left" valign="top">What should I ask the doctor before taking birth control pills?</td><td align="left" valign="top">Contraception and Family Planning</td><td align="left" valign="top">Usage Guidance</td></tr><tr><td align="left" valign="top">Agar PCOD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> ka 3 mahina dawa chalane ke baad bhi shi nahi hua to kya kare?</td><td align="left" valign="top">If PCOD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> does not improve after three months of medication, what should I do?</td><td 
align="left" valign="top">PCOS<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> or PCOD<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Management</td></tr><tr><td align="left" valign="top">Agar abhi bacha nahi rakhna hai aur fir bhi pregnant ho gaye to abortion ke liye kya kare?</td><td align="left" valign="top">If I do not want to have a baby right now but still get pregnant, what should I do for an abortion?</td><td align="left" valign="top">Pregnancy and PNC<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">Abortion</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>PCOD: polycystic ovarian disease.</p></fn><fn id="table1fn2"><p><sup>b</sup>PCOS: polycystic ovary syndrome.</p></fn><fn id="table1fn3"><p><sup>c</sup>PNC: postnatal care.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s1-2"><title>Related Work</title><p>Hierarchical classification is a method of assigning items to categories organized within a hierarchy of classes [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. This approach has been broadly used in a range of domains where layered structures are important, including e-commerce user query categorization [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], health care question answering [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], clinical guideline classification [<xref ref-type="bibr" rid="ref21">21</xref>], and personalized health care analysis of women&#x2019;s menstrual health disorders [<xref ref-type="bibr" rid="ref28">28</xref>]. 
Its advantages lie in its ability to model the inherent complexity of user intent, particularly in health care contexts where individuals raise their health care concerns by first referencing broader domains and then focusing on specific details or contexts. While flat classification systems are widely used as classification baselines, their lack of hierarchical awareness limits their effectiveness, especially in domains where understanding class structure is essential [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Although hierarchical classification has demonstrated clear benefits across diverse applications, its potential remains underexplored in the context of code-mixed languages and SRH, where understanding layered and implicit user intent is especially important.</p><p>The recent development of highly parameterized language models, such as GPT-5 (OpenAI) [<xref ref-type="bibr" rid="ref30">30</xref>], GPT-4o (OpenAI) [<xref ref-type="bibr" rid="ref31">31</xref>], Llama 3 (Meta) [<xref ref-type="bibr" rid="ref32">32</xref>], and Gemma 3 (Google) [<xref ref-type="bibr" rid="ref33">33</xref>], among many others, has significantly enhanced health care reasoning, understanding, and summarization [<xref ref-type="bibr" rid="ref34">34</xref>]. These models can generate concise and user-friendly summaries and have the potential to make medical information more accessible to the general public. Despite these developments, a major limitation remains their reliance on training data in English, making them less effective for regional languages [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. For instance, Llama 3 draws 90% of its pretraining data from English sources [<xref ref-type="bibr" rid="ref4">4</xref>], while a substantial portion of GPT-4&#x2019;s pretraining data is likewise in English [<xref ref-type="bibr" rid="ref5">5</xref>]. 
This linguistic bias creates information disparities across languages, where equivalent questions in different languages may produce inconsistent and inaccurate responses [<xref ref-type="bibr" rid="ref36">36</xref>]. Our study seeks to address this gap.</p><p>Recent research has increasingly focused on using chatbots to provide health care assistance, particularly around SRH. For example, Wang et al [<xref ref-type="bibr" rid="ref15">15</xref>] designed an SRH-focused rule-based chatbot on Facebook Messenger (Meta Platforms, Inc) to analyze how users engage with artificial intelligence (AI) to seek health information. Their study found that users frequently shared personal concerns and SRH-related queries in code-mixed languages, highlighting the need for linguistically adaptable health care AI models. There are several reasons why LLMs can be useful for SRH information delivery, including gaps in sexual education [<xref ref-type="bibr" rid="ref37">37</xref>] and societal stigma and taboo around sex-related topics [<xref ref-type="bibr" rid="ref12">12</xref>]. However, recent studies indicate that LLMs struggle with Indic languages, particularly when queries are code-mixed and culturally situated [<xref ref-type="bibr" rid="ref38">38</xref>]. To analyze these issues systematically, we collected a dataset called sexual and reproductive health queries (SRHQ)-India, which captures real-world Indian SRH user queries in a code-mixed format.</p><p>As queries in SRH domains are often linguistically and contextually diverse, the hierarchical categorization approach offers the potential for more accurate and interpretable results. Recent advancements in LLMs have made significant improvements in flat text classification, especially in the health care domain. In our work, we seek to evaluate LLMs&#x2019; understanding of the hierarchical classification task. 
In this regard, we explore the use of a zero-shot learning technique that involves crafting prompts to enable language models to generate useful responses without prior examples, relying entirely on pretrained knowledge to tackle the new task. This technique has been used in several health care applications, such as capturing the context of clinical text [<xref ref-type="bibr" rid="ref39">39</xref>] and classifying health care queries [<xref ref-type="bibr" rid="ref40">40</xref>]. In our work, we leverage zero-shot learning to evaluate the effectiveness of language models in processing code-mixed user queries.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>In this study, we used a dataset gathered by the Myna Mahila Foundation, a nonprofit women&#x2019;s health organization in India. The data were collected through a preliminary SRH chatbot prototype that was developed and piloted with 488 women in informal settlements of Mumbai, India, from October 2023 to December 2024 to capture real-world user queries on SRH, including both text and voice input. This is part of their ongoing effort to improve access to SRH information for women and girls in urban slums. In the &#x201C;Dataset Generation and Curation&#x201D; and &#x201C;Data Annotation&#x201D; subsections below, we detail our approach to structuring and annotating the dataset. We also outline our evaluation framework for model performance.</p></sec><sec id="s2-2"><title>Dataset Generation and Curation</title><p>The dataset originally contained 4858 queries and was refined to reduce redundancy using cosine similarity computed over term frequency&#x2013;inverse document frequency representations of the text. Queries were first normalized using standard text preprocessing, including Unicode correction, lowercasing, whitespace normalization, and removal of basic punctuation. 
Cosine similarity was then used to identify lexically similar queries; pairs with a similarity score of 90% or higher were considered near duplicates, and only one query from each pair was retained. For example, <italic>&#x201C;</italic>kya saheli tablet lene se periods ka date badal jata hai?<italic>&#x201D;</italic> and <italic>&#x201C;</italic>saheli tablet se periods ka date badal jata hai kya<italic>&#x201D;</italic> convey the same meaning: &#x201C;Does taking the Saheli tablet (a nonhormonal contraceptive pill) change the period date?&#x201D; Term frequency&#x2013;inverse document frequency&#x2013;based lexical similarity was chosen instead of pretrained sentence embeddings because many widely used embedding models are trained predominantly on monolingual or English-centric data and perform poorly on code-mixed Indian languages [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. Moreover, our aim was to remove lexically rephrased queries rather than infer semantic equivalence. In addition, we removed queries containing fewer than three words to exclude vague or incomplete inputs. We also filtered out greeting-based queries, such as <italic>&#x201C;</italic>Namaste<italic>&#x201D;</italic> or <italic>&#x201C;</italic>Aap kaise hai<italic>&#x201D;</italic> (&#x201C;How are you&#x201D;), which do not contribute to intent classification.</p><p>After filtering, we obtained a final dataset of 4161 queries, primarily in Hinglish. The dataset consists of queries with misspelled English words, transliterated Hindi terms, and borrowed English medical words that are common in everyday SRH talk (eg, periods, condom, pregnancy, in vitro fertilization [IVF], and polycystic ovary syndrome). We also observed queries related to cultural practices or religious beliefs. 
For example, the query &#x201C;Masik pali aane se Bhagvan ke pas kau nahi jana chahiye?&#x201D; translates to &#x201C;Why should one avoid going to god or temples during periods?&#x201D; This query centers on the religious norm, found in certain cultures and communities in India, that women are advised not to visit the temple during their periods. Additionally, the query <italic>&#x201C;</italic>Det ke pahele piryat q aata hai<italic>&#x201D;</italic> translates to &#x201C;Why do periods come before the date?&#x201D; Here, &#x201C;det&#x201D; (date) and &#x201C;piryat&#x201D; (period) are misspelled terms. Such variations highlight the real-world linguistic challenges in SRH-related conversations and underscore the need for models capable of accurately processing noisy, code-mixed, and culturally or religiously influenced queries.</p><p>Beyond linguistic complexities, the dataset also captures deeply ingrained cultural myths and misinformation, particularly related to pregnancy and gender beliefs. For instance, the query <italic>&#x201C;</italic>Main pregnant hu or jinko sirf ladkiya hai kya unko dekte rahne se kya muje ladki hogi?<italic>&#x201D;</italic> translates to &#x201C;I am pregnant; if I keep seeing women who have only given birth to daughters, will I also have a daughter?&#x201D; Similarly, <italic>&#x201C;</italic>Kya muje sirf ladka hi chahiye to uske liye khuch upay hai kya?<italic>&#x201D;</italic> means &#x201C;Is there any way to have only a boy?&#x201D; These queries reflect long-standing cultural expectations in some communities in India, where socioeconomic structures reinforce a preference for sons. Such myths not only influence reproductive decisions but also contribute to gender-based discrimination and misinformation.</p><p>We adopted a hierarchical thematic structure for annotation. Queries were first classified into broad topics and then further refined into subcategories to capture the complexity of SRH-related concerns. 
For example, questions related to postpartum, antepartum, and abortion fall under the broad Pregnancy and Postnatal Care (PNC) category, while questions in the Menstrual Health category are further classified by context into subtopics, including Menstrual Cycle Information, Menstrual Flow, and Period Pain Management. We also included Mental Health and Wellness, with subtopics including Stress Management and Safety Concerns, because stress, sleep, mood, and anxiety issues often co-occur with SRH questions and influence help-seeking and guidance needs. Placing these queries within the SRH hierarchy keeps related concerns together and improves intent interpretation. <xref ref-type="table" rid="table2">Table 2</xref> provides the annotation framework in detail. The annotation guidelines were developed through an iterative process of feedback from doctors and program staff at the Myna Mahila Foundation. This process helped standardize interpretations and improve annotation reliability across the dataset. Key descriptive statistics of the dataset are provided in <xref ref-type="table" rid="table3">Table 3</xref>. 
In addition, we quantify the degree of intrasentential Hindi-English code-mixing using the Code-Mixing Index (CMI); dataset-level CMI statistics are reported in <xref ref-type="table" rid="table3">Table 3</xref>, and the metric is described in the following &#x201C;CMI Calculation&#x201D; subheading.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Annotated dataset distribution by topic and subtopic.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Topic and subtopic</td><td align="left" valign="bottom">Queries (N=4161), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Contraception and Family Planning</td><td align="char" char="parenthesis" valign="top">1417 (34.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Family Planning Queries</td><td align="char" char="parenthesis" valign="top">522 (36.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Usage Guidance</td><td align="char" char="parenthesis" valign="top">291 (20.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Types of Contraceptives</td><td align="char" char="parenthesis" valign="top">209 (14.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sterilization</td><td align="char" char="parenthesis" valign="top">184 (13)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Effectiveness and Duration</td><td align="char" char="parenthesis" valign="top">127 (9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Side Effects</td><td align="char" 
char="parenthesis" valign="top">84 (5.9)</td></tr><tr><td align="left" valign="top">Menstrual Health</td><td align="char" char="parenthesis" valign="top">1104 (26.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Menstrual Cycle Information</td><td align="char" char="parenthesis" valign="top">673 (61)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Period Pain Management</td><td align="char" char="parenthesis" valign="top">325 (29.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sanitary Products and Hygiene</td><td align="char" char="parenthesis" valign="top">62 (5.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Menstrual Flow</td><td align="char" char="parenthesis" valign="top">44 (4)</td></tr><tr><td align="left" valign="top">Pregnancy and PNC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="char" char="parenthesis" valign="top">547 (13.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Pregnancy Information</td><td align="char" char="parenthesis" valign="top">290 (53)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Antepartum</td><td align="char" char="parenthesis" valign="top">120 (21.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Infertility</td><td align="char" char="parenthesis" valign="top">53 (9.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Abortion</td><td align="char" 
char="parenthesis" valign="top">26 (4.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Postpartum</td><td align="char" char="parenthesis" valign="top">23 (4.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Miscarriage</td><td align="char" char="parenthesis" valign="top">20 (3.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Breastfeeding</td><td align="char" char="parenthesis" valign="top">15 (2.7)</td></tr><tr><td align="left" valign="top">Sexual and Vaginal Health</td><td align="char" char="parenthesis" valign="top">322 (7.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sex-Related Queries</td><td align="char" char="parenthesis" valign="top">128 (39.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vaginal Health and Discharge</td><td align="char" char="parenthesis" valign="top">127 (39.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Reproductive Anatomy</td><td align="char" char="parenthesis" valign="top">26 (8.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Urinary Tract Infections (UTI)</td><td align="char" char="parenthesis" valign="top">16 (5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sexually Transmitted Infections (STI or STD)</td><td align="char" char="parenthesis" valign="top">15 (4.7)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vaginal or Uterine Infections</td><td align="char" char="parenthesis" valign="top">10 (3.1)</td></tr><tr><td align="left" valign="top">PCOS or PCOD<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="char" char="parenthesis" valign="top">101 (2.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Information</td><td align="char" char="parenthesis" valign="top">49 (48.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Management</td><td align="char" char="parenthesis" valign="top">28 (27.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Symptoms</td><td align="char" char="parenthesis" valign="top">24 (23.8)</td></tr><tr><td align="left" valign="top">HIV<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="char" char="parenthesis" valign="top">52 (1.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Stigma and Awareness</td><td align="char" char="parenthesis" valign="top">14 (26.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Treatment</td><td align="char" char="parenthesis" valign="top">14 (26.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prevention</td><td align="char" char="parenthesis" valign="top">12 (23.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Symptoms and Early Detection</td><td align="char" char="parenthesis" 
valign="top">12 (23.1)</td></tr><tr><td align="left" valign="top">Mental Health and Wellness</td><td align="char" char="parenthesis" valign="top">22 (0.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Stress Management</td><td align="char" char="parenthesis" valign="top">13 (59.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Information and Safety Concerns</td><td align="char" char="parenthesis" valign="top">9 (40.9)</td></tr><tr><td align="left" valign="top">Other</td><td align="char" char="parenthesis" valign="top">596 (14.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>General Health Queries</td><td align="char" char="parenthesis" valign="top">361 (60.6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diet and Nutrition</td><td align="char" char="parenthesis" valign="top">97 (16.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Exercise and Fitness</td><td align="char" char="parenthesis" valign="top">46 (7.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Health Equity and Access</td><td align="char" char="parenthesis" valign="top">38 (6.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Marriage and Relationships</td><td align="char" char="parenthesis" valign="top">21 (3.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Misconceptions and Myths</td><td align="char" char="parenthesis" valign="top">13 (2.2)</td></tr><tr><td 
align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Child Health</td><td align="char" char="parenthesis" valign="top">12 (2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cultural, Religious, or Moral Norms</td><td align="char" char="parenthesis" valign="top">8 (1.3)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>PNC: postnatal care. </p></fn><fn id="table2fn2"><p><sup>b</sup>PCOS: polycystic ovary syndrome.</p></fn><fn id="table2fn3"><p><sup>c</sup>PCOD: polycystic ovarian disease.</p></fn><fn id="table2fn4"><p><sup>d</sup>HIV: human immunodeficiency virus.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Statistics of the dataset.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Statistic</td><td align="left" valign="bottom">Values</td></tr></thead><tbody><tr><td align="left" valign="top">Total number of topics</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Total number of subtopics</td><td align="left" valign="top">40</td></tr><tr><td align="left" valign="top">Total number of queries</td><td align="left" valign="top">4161</td></tr><tr><td align="left" valign="top">Maximum question length (words)</td><td align="left" valign="top">71 words</td></tr><tr><td align="left" valign="top">Average question length (words)</td><td align="left" valign="top">10.56 words</td></tr><tr><td align="left" valign="top">% queries in transliterated Hindi<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">61.53</td></tr><tr><td align="left" valign="top">% queries in intrasentential mixing<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="top">38.37</td></tr><tr><td align="left" 
valign="top">Code-Mixing Index (CMI), %, mean (SD)</td><td align="left" valign="top">37.38 (10.43)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Transliterated Hindi refers to text written in Hindi using the Latin script, often incorporating English loan words common in sexual and reproductive health (SRH) contexts (eg, periods, condom, and pregnancy).</p></fn><fn id="table3fn2"><p><sup>b</sup>Intrasentential mixing refers to text where Hindi and English elements are combined within the same sentence, written in Latin script.</p></fn></table-wrap-foot></table-wrap><p>Our approach to annotation serves 2 purposes. First, hierarchical classification offers a deeper perspective on SRH concerns, allowing the identification of common themes and trends in user queries. It strengthens decision support for stakeholders, including health care professionals, policymakers, and organizations, by offering a well-organized dataset that can inform better resource allocation and intervention strategies. 
Second, hierarchical classification can enhance user guidance and response accuracy based on user intent, ensuring that individuals are directed to relevant and specific information tailored to their concerns.</p></sec><sec id="s2-3"><title>CMI Calculation</title><p>The CMI [<xref ref-type="bibr" rid="ref43">43</xref>] is an utterance-level, ratio-based metric that quantifies intrasentential code-mixing by measuring the proportion of lexical tokens that do not belong to the matrix (dominant) language.</p><p>CMI is defined as:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mtext>CMI</mml:mtext><mml:mo>=</mml:mo><mml:mn>100</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mrow><mml:mo movablelimits="true" form="prefix">max</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>u</mml:mi></mml:mrow></mml:mfrac></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mtext>if&#x00A0;</mml:mtext><mml:mi>n</mml:mi><mml:mo>&#x003E;</mml:mo><mml:mi>u</mml:mi></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Where <inline-formula><mml:math id="ieqn1"><mml:mi>n</mml:mi></mml:math></inline-formula> is the total number of tokens, <inline-formula><mml:math id="ieqn2"><mml:mi>u</mml:mi></mml:math></inline-formula> is the number of language-independent tokens, <inline-formula><mml:math id="ieqn3"><mml:mi>n</mml:mi><mml:mo>-</mml:mo><mml:mi>u</mml:mi></mml:math></inline-formula> is the sum of the number of tokens from N languages, and <inline-formula><mml:math id="ieqn4"><mml:mrow><mml:mrow><mml:mi 
mathvariant="normal">max</mml:mi></mml:mrow><mml:mo>&#x2061;</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is the highest number of words belonging to a particular language. If an utterance contains only language-independent tokens (<inline-formula><mml:math id="ieqn5"><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mi>u</mml:mi></mml:math></inline-formula>), the CMI is defined as zero.</p><p>We used the pretrained FastText (Meta AI) language identification model, which supports 176 languages [<xref ref-type="bibr" rid="ref44">44</xref>], to automatically assign language labels at the token level. FastText has been widely used for language identification in multilingual and code-mixed text due to its efficiency and robustness on short lexical units, and its use of subword information [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Most queries in the dataset are bilingual in nature, with one dominant language, either English or Romanized Hindi, serving as the matrix language within each utterance. We used the model to identify English tokens (and assumed the remaining tokens to be Hindi) due to potential concerns about its performance on Hindi words. Numerals, punctuation, and symbols were treated as language-independent. This automated procedure enabled consistent computation of sentence-level CMI scores across all 4161 queries.</p></sec><sec id="s2-4"><title>Data Annotation</title><p>We annotated the user queries into 8 categories, which were further divided into subcategories to capture the depth and complexity of the topics. The annotation of our code-mixed dataset on SRH required both strong linguistic competence and domain expertise. 
Since the data are code-mixed (Hindi and English), they present unique challenges such as nonstandard spelling, switching between languages in the same query, and the cultural nuances embedded in users&#x2019; expressions. To address these complexities, this study included 2 experienced annotators with backgrounds in public health and global health. Both annotators are native or fluent speakers of Indian languages and coauthors of this paper, ensuring familiarity with cultural and linguistic contexts relevant to SRH queries. Both were guided by a comprehensive annotation manual, refined by subject matter experts (public health care professionals and medical doctors). Some user questions could overlap with multiple topics or subtopics (for instance, a question about convincing a partner to use contraceptives could fall under the &#x201C;Family Planning&#x201D; or &#x201C;Marriage and Relationships&#x201D; category). In such cases, annotators selected the closest-fitting category. The full annotation guidelines are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To ensure annotation quality and consistency, we adopted an iterative annotation process. Initially, 10% of the dataset was annotated independently by both annotators. This step was particularly critical, as it allowed us to refine annotation guidelines and address challenges arising from orthographic variations, code-mixing, and semantic ambiguities. Discrepancies were resolved through discussion and led to refinements in the annotation guidelines. We continued the iterative co-annotation process until the annotators consistently achieved a 95% agreement threshold during the pilot phase. Once the threshold was met, the remaining data were divided between the 2 annotators for independent annotation. We used Cohen kappa [<xref ref-type="bibr" rid="ref47">47</xref>] across overlapping annotated samples to compute interannotator agreement. 
The resulting score of 83% indicated almost perfect agreement [<xref ref-type="bibr" rid="ref48">48</xref>]. The distribution of the query categories and subcategories is provided in <xref ref-type="table" rid="table2">Table 2</xref>.</p></sec><sec id="s2-5"><title>Evaluation Metrics</title><p>To evaluate model performance on our hierarchical classification task, we adopted the hierarchical <italic>F</italic><sub>1</sub>-score (h<italic>F</italic><sub>1</sub>) proposed by Kosmopoulos et al [<xref ref-type="bibr" rid="ref23">23</xref>]. It extends traditional metrics by explicitly considering the hierarchical relationship among labels. To compute this metric, we augment both the predicted labels (<inline-formula><mml:math id="ieqn6"><mml:mover accent="true"><mml:mrow><mml:mi>Y</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula>) and the ground truth labels (<inline-formula><mml:math id="ieqn7"><mml:mi>Y</mml:mi></mml:math></inline-formula>) by incorporating their ancestor labels from the hierarchy, resulting in augmented sets <inline-formula><mml:math id="ieqn8"><mml:mover><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn9"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:math></inline-formula>. 
This augmentation ensures that the evaluation accurately reflects hierarchical dependencies between topics and subtopics.</p><p>The hierarchical <italic>F</italic><sub>1</sub>-score (h<italic>F</italic><sub>1</sub>) is defined as:</p><disp-formula id="E1"><label>(1)</label><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mi>h</mml:mi><mml:msub><mml:mi>F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#x22C5;</mml:mo><mml:mi>h</mml:mi><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mo>&#x22C5;</mml:mo><mml:mi>h</mml:mi><mml:mi>R</mml:mi><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mo>+</mml:mo><mml:mi>h</mml:mi><mml:mi>R</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mtext>&#x00A0;where&#x00A0;</mml:mtext><mml:mi>h</mml:mi><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:munder><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mover><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:munder><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mover><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mo 
stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mi>h</mml:mi><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:munder><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mover><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mo>&#x2229;</mml:mo><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:munder><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi></mml:munder><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">|</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mstyle></mml:mstyle></mml:mrow></mml:mstyle></mml:math></disp-formula><p>Here, <inline-formula><mml:math id="ieqn10"><mml:mi>h</mml:mi><mml:mi>P</mml:mi><mml:mi>r</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math id="ieqn11"><mml:mi>h</mml:mi><mml:mi>R</mml:mi><mml:mi>e</mml:mi></mml:math></inline-formula> (hierarchical precision and recall) evaluate the proportion of correctly predicted hierarchical labels among all predicted and actual labels, respectively. Additionally, we used supporting metrics, including Exact Match, which evaluates the percentage of queries where both the topic and subtopic predictions exactly match the ground truth labels. Accuracy@l<sub>1</sub> measures the topic-level accuracy, reflecting the proportion of samples where the predicted topic is correct. Accuracy@l<sub>2</sub> measures conditional subtopic accuracy, defined as the proportion of correct subtopic predictions given that the topic was correctly predicted. 
These complementary metrics collectively provide a comprehensive evaluation of the model&#x2019;s performance, capturing strict end-to-end correctness (Exact Match), coarse-grained topic identification (Accuracy@l<sub>1</sub>), fine-grained subtopic discrimination (Accuracy@l<sub>2</sub>), and hierarchical sensitivity (h<italic>F</italic><sub>1</sub>).</p></sec><sec id="s2-6"><title>Experimental Analysis</title><p>In this study, we evaluate the performance of language models on a hierarchical classification task in a zero-shot setting, where the model attempts to generalize output patterns at the topic and subtopic levels for data it has not encountered during training. Zero-shot evaluation is particularly challenging because it requires models to infer patterns without explicit task-specific fine-tuning.</p><p>In our setup, models were provided with the hierarchical structure and the list of possible topic and subtopic labels, but not with the detailed semantic definitions used in the human annotation guideline. This was a deliberate design choice to evaluate the models&#x2019; intrinsic ability to interpret real-world, code-mixed SRH queries without additional semantic instruction. It also reflects practical deployment scenarios where the model must generalize from raw user input rather than rely on curated taxonomies at the time of inference.</p><p>Our evaluation includes 7 open-weight multilingual models, 5 Indic-specific models, and 3 proprietary models. 
The open-weight multilingual models are Mixtral-8 &#x00D7; 7B-Instruct (Mistral AI) [<xref ref-type="bibr" rid="ref49">49</xref>], Llama-3.1-8B-Instruct (Meta) [<xref ref-type="bibr" rid="ref32">32</xref>], Llama-3.3-70B-Instruct (Meta), Gemma-2-9B-IT (Google) [<xref ref-type="bibr" rid="ref50">50</xref>], Gemma-3-27B-IT (Google) [<xref ref-type="bibr" rid="ref33">33</xref>], Qwen-2.5-7B-Instruct (Alibaba Group) [<xref ref-type="bibr" rid="ref51">51</xref>], and Aya-Expanse-8B (Cohere Labs) [<xref ref-type="bibr" rid="ref52">52</xref>]. The technical reports of these models demonstrate their multilingual capabilities on benchmark datasets, making them strong candidates for handling code-mixed data.</p><p>The Indic models are Airavata [<xref ref-type="bibr" rid="ref53">53</xref>], an instruction-tuned Hindi language model developed by AI4Bharat; AryaBhatta-GemmaGenZ-Vikas-Merged [<xref ref-type="bibr" rid="ref54">54</xref>] (referred to as AryaBhatta [GenVR Research]), a model trained on 9 Indic languages that excels particularly in Hindi reasoning and literature tasks; and Llama-3-Gaja-Hindi-8B [<xref ref-type="bibr" rid="ref55">55</xref>], a bilingual Hindi-English LLM specialized in Indic language understanding. We also evaluate Krutrim-2-Instruct [<xref ref-type="bibr" rid="ref56">56</xref>], a Mistral-NeMo (Mistral AI and NVIDIA)&#x2013;based model trained on diverse domains, including Indic languages, and fine-tuned with direct preference optimization to improve alignment and reasoning for Indian contexts. For proprietary baselines, we include GPT-5 [<xref ref-type="bibr" rid="ref30">30</xref>], GPT-4o [<xref ref-type="bibr" rid="ref31">31</xref>], and Claude-3.5-Sonnet [<xref ref-type="bibr" rid="ref57">57</xref>], which serve as state-of-the-art performance references despite their closed parameter counts.</p><p>For open-weight multilingual models, we include a range spanning smaller (7B-9B) to larger (27B-70B) architectures. 
Alongside these, we also include Sarvam-M (Sarvam AI) [<xref ref-type="bibr" rid="ref58">58</xref>], a state-of-the-art open-source hybrid Indic LLM built on Mistral-Small, designed to enhance reasoning in Indian languages through supervised fine-tuning and reward-based reinforcement learning. Since Indic LLMs are still emerging, our evaluation also includes small to mid-sized versions currently available (7B-12B). This selection allows for a balanced comparison across Indic, multilingual, and proprietary systems, highlighting the relative strengths and limitations of each category.</p><p>With LLMs, a precise and well-structured prompt is paramount for shaping responses and ensuring that the model focuses on the most relevant aspects of the input. Recent research [<xref ref-type="bibr" rid="ref59">59</xref>] has shown that even minor lexical variation&#x2014;sometimes just a single-word difference&#x2014;in prompts can significantly impact model performance on downstream tasks. To accurately determine topic and subtopic levels in our task, we experimented with multiple prompt variations on a small, representative subset of our dataset. This iterative process allowed us to refine the prompts efficiently before running the large-scale evaluation. Smaller open-weight models were evaluated locally on a graphics processing unit with 48GB of memory (eg, NVIDIA RTX A6000), while proprietary models were accessed via application programming interface&#x2013;based inference. The final prompt template used across all models is shown in Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
The prompt template also requested a self-reported confidence score (0.0&#x2010;1.0); this value was extracted for future analysis but was not used in this evaluation.</p></sec><sec id="s2-7"><title>Qualitative Error Analysis</title><p>To better understand the strengths and limitations of the models, we conducted a qualitative error analysis of the 2 best-performing models in each family (proprietary, multilingual open-weight, and Indic models). For each model, we sampled 50 misclassified test queries stratified by topic and subtopic. The same 2 annotators who created the dataset served as reviewers and independently coded thematic errors; disagreements were resolved by consensus. Results are summarized in the &#x201C;Error Analysis&#x201D; section, with examples in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance of 6 models on representative sexual and reproductive health queries (SRHQ) from the SRHQ-India dataset at the topic-subtopic level.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Error types</td><td align="left" valign="bottom">Query (Hinglish&#x2192;English)</td><td align="left" valign="bottom">Ground truth<break/>(Topic&#x2192;Subtopic)</td><td align="left" valign="bottom">GPT-5</td><td align="left" valign="bottom">Claude-3.5-Sonnet</td><td align="left" valign="bottom">Llama-3.3-70B-Instruct</td><td align="left" valign="bottom">Gemma-3-27B-IT</td><td align="left" valign="bottom">Sarvam-M</td><td align="left" valign="bottom">Llama-3-Gaja-Hindi-8B</td></tr></thead><tbody><tr><td align="left" valign="top">C1</td><td align="left" valign="top">Safaiya kaise karwate hain?<break/>(How is an abortion done?)</td><td align="left" valign="top">Pregnancy and PNC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>&#x2192;Abortion</td><td align="left" valign="top">&#x2713;<sup><xref 
ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">C2</td><td align="left" valign="top">Pregnancy me 5 month me vomiting hoti h to kya kre<break/>(What should I do if I am vomiting in the 5th month of pregnancy?)</td><td align="left" valign="top">Pregnancy and PNC&#x2192;Antepartum</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">C3</td><td align="left" valign="top">Family planning Muslim community me accept hain kya?<break/>(Is family planning accepted in the Muslim community?)</td><td align="left" valign="top">Contraception and Family Planning&#x2192;Family Planning Queries</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td></tr><tr><td align="left" valign="top">C3</td><td align="left" valign="top">Masik pali aane se Bhagvan ke pas kau nahi jana chahiye?<break/>(Why should one avoid going to god/temples during periods?)</td><td align="left" valign="top">Other&#x2192;Cultural, Religious, or Moral Norms</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" 
valign="top">C4</td><td align="left" valign="top">1 sal se bacha rukhne ke liye try kar rahe hai lekin nahi rukh raha hai to kya karna padega?<break/>(We have been trying to have a child for one year, but it has not happened. What should we do?)</td><td align="left" valign="top">Pregnancy and PNC&#x2192;Infertility</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">C4</td><td align="left" valign="top">Family planning may agar koi mahila test tube karvati hai to kya hota hai?<break/>(In family planning, if a woman undergoes test-tube treatment, what happens?)</td><td align="left" valign="top">Pregnancy and PNC&#x2192;Infertility</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">C5</td><td align="left" valign="top">Kitni der jinda rahte hain sperm?<break/>(How long do sperm stay alive?)</td><td align="left" valign="top">Sexual and Vaginal Health&#x2192;Reproductive Anatomy</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2713;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>PNC: postnatal care.</p></fn><fn id="table4fn2"><p><sup>b</sup>&#x2713; indicates correct topic-subtopic classification.</p></fn><fn id="table4fn3"><p><sup>c</sup>&#x2014; indicates incorrect topic-subtopic classification.</p></fn></table-wrap-foot></table-wrap></sec><sec 
id="s2-8"><title>Ethical Considerations</title><p>We used deidentified secondary data shared by the Myna Mahila Foundation. Since the dataset was fully deidentified before being shared, no personally identifiable information was accessible to the research team at Emory University. This study did not involve direct interaction with human participants, and no additional data collection was conducted. Approval for the use of this secondary dataset was obtained from Emory University&#x2019;s Institutional Review Board (Protocol #2025P011010). Data security measures were implemented to maintain confidentiality, and the dataset was stored on a secure, password-protected system. Only authorized personnel had access to the data. As the study relies on pre-existing deidentified data, the risk to individuals is minimal.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>We now present the results of the hierarchical classification of SRHQ with LLMs. <xref ref-type="table" rid="table5">Table 5</xref> compares model performance on the hierarchical classification across 3 types of models: open-weight, Indic (LLMs fine-tuned for Indian languages), and proprietary. <xref ref-type="fig" rid="figure2">Figure 2</xref> provides a visual comparison across 4 evaluation metrics: hierarchical <italic>F</italic><sub>1</sub>(h<italic>F</italic><sub>1</sub>), Exact Match, and accuracy at the topic (l<sub>1</sub>) and subtopic (l<sub>2</sub>) levels. Although we also evaluated GPT-4o, its performance was less than 1.5% lower than GPT-5 across metrics. 
Bootstrap-based 95% CIs further show substantial overlap between GPT-5 (h<italic>F</italic><sub>1</sub>=0.784, 95% CI 0.774&#x2010;0.795) and GPT-4o (h<italic>F</italic><sub>1</sub>=0.779, 95% CI 0.768&#x2010;0.789), indicating that performance differences between the 2 proprietary systems are not statistically distinguishable (Table S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). For consistency and balanced cross-category comparisons, we focus here on GPT-5 and Claude-3.5-Sonnet as representative proprietary models.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Performance comparison of different models in a zero-shot setting on the sexual and reproductive health queries (SRHQ) dataset. Open-weight, proprietary, and Indic models are distinctly highlighted for clarity. All Indic models are open weights. In each category, the best-performing model is highlighted in <bold>bold</bold>, and the second-best is <underline>underlined</underline>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category and model</td><td align="left" valign="bottom">#Params</td><td align="left" valign="bottom">h<italic>F</italic><sub>1</sub> (hierarchical <italic>F</italic>1)</td><td align="left" valign="bottom">Exact Match</td><td align="left" valign="bottom">Accuracy@l<sub>1</sub></td><td align="left" valign="bottom">Accuracy@l<sub>2</sub></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">Open-weight models</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Mixtral-8 &#x00D7; 7B-Instruct</td><td align="left" valign="top">7B</td><td align="left" valign="top">0.593</td><td align="left" valign="top">0.453</td><td align="left" valign="top">0.733</td><td align="left" valign="top">0.617</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama-3.1-8B-Instruct</td><td align="left" valign="top">8B</td><td align="left" valign="top">0.630</td><td align="left" valign="top">0.491</td><td align="left" valign="top">0.769</td><td align="left" valign="top">0.638</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen-2.5-7b-Instruct</td><td align="left" valign="top">7B</td><td align="left" valign="top">0.605</td><td align="left" valign="top">0.463</td><td align="left" valign="top">0.747</td><td align="left" valign="top">0.619</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Aya-Expanse-8B</td><td align="left" valign="top">8B</td><td align="left" valign="top">0.528</td><td align="left" valign="top">0.411</td><td align="left" valign="top">0.646</td><td align="left" valign="top">0.636</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gemma-2-9B-IT</td><td align="left" valign="top">9B</td><td align="left" valign="top">0.657</td><td align="left" valign="top">0.544</td><td align="left" valign="top">0.770</td><td align="left" valign="top">0.706</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gemma-3-27B-IT</td><td align="left" valign="top">27B</td><td align="left" valign="top"><underline>0.739</underline></td><td align="left" valign="top"><underline>0.629</underline></td><td align="left" valign="top"><underline>0.849</underline></td><td align="left" valign="top"><underline>0.741</underline></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama-3.3-70B-Instruct</td><td align="left" valign="top">70B</td><td align="left" valign="top"><bold>0.742</bold></td><td 
align="left" valign="top"><bold>0.630</bold></td><td align="left" valign="top"><bold>0.853</bold></td><td align="left" valign="top"><bold>0.738</bold></td></tr><tr><td align="left" valign="top" colspan="6">Indic models</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Airavata</td><td align="left" valign="top">7B</td><td align="left" valign="top">0.404</td><td align="left" valign="top">0.226</td><td align="left" valign="top">0.581</td><td align="left" valign="top">0.389</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Llama-3-Gaja-Hindi-8B</td><td align="left" valign="top">8B</td><td align="left" valign="top"><underline>0.596</underline></td><td align="left" valign="top"><underline>0.452</underline></td><td align="left" valign="top"><underline>0.740</underline></td><td align="left" valign="top"><underline>0.610</underline></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>AryaBhatta</td><td align="left" valign="top">8.5B</td><td align="left" valign="top">0.365</td><td align="left" valign="top">0.157</td><td align="left" valign="top">0.574</td><td align="left" valign="top">0.273</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Krutrim-2-Instruct</td><td align="left" valign="top">12B</td><td align="left" valign="top">0.558</td><td align="left" valign="top">0.386</td><td align="left" valign="top">0.731</td><td align="left" valign="top">0.527</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sarvam-M</td><td align="left" valign="top">24B</td><td align="left" valign="top"><bold>0.757</bold></td><td align="left" valign="top"><bold>0.647</bold></td><td align="left" 
valign="top"><bold>0.867</bold></td><td align="left" valign="top"><bold>0.747</bold></td></tr><tr><td align="left" valign="top" colspan="6">Proprietary models</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-5</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></td><td align="left" valign="top"><bold>0.784</bold></td><td align="left" valign="top"><bold>0.683</bold></td><td align="left" valign="top"><bold>0.886</bold></td><td align="left" valign="top"><bold>0.771</bold></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4o<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.779</td><td align="left" valign="top">0.675</td><td align="left" valign="top">0.882</td><td align="left" valign="top">0.764</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude-3.5-Sonnet</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top"><underline>0.745</underline></td><td align="left" valign="top"><underline>0.639</underline></td><td align="left" valign="top"><underline>0.851</underline></td><td align="left" valign="top"><underline>0.751</underline></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Not available.</p></fn><fn id="table5fn2"><p><sup>b</sup>Although GPT-4o was the second-best-performing proprietary model, its performance differed from GPT-5 by less than 1.5% across metrics. 
Therefore, GPT-5 and Claude-3.5-Sonnet were selected as representative proprietary models for balanced cross-category comparisons in the main analysis.</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Visual comparison of model performance on the sexual and reproductive health queries (SRHQ) dataset across 4 evaluation metrics: hierarchical <italic>F</italic><sub>1</sub> (h<italic>F</italic><sub>1</sub>), Exact Match, Accuracy@l<sub>1</sub>, and Accuracy@l<sub>2</sub>. The figure highlights relative strengths and differences among open-weight, Indic, and proprietary models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86545_fig02.png"/></fig><p>Proprietary systems delivered the highest and most consistent performance, with GPT-5 achieving the best results across all metrics, with h<italic>F</italic><sub>1</sub>=0.784, Exact Match=0.683, and accuracies of 0.886 at l<sub>1</sub> and 0.771 at l<sub>2</sub>. Claude-3.5-Sonnet closely followed (h<italic>F</italic><sub>1</sub>=0.745 and Exact Match=0.639), exceeding 0.85 topic-level accuracy and 0.75 subtopic-level accuracy.</p><p>Among open-weight models, the strongest performers were Llama-3.3-70B-Instruct and Gemma-3-27B-IT, which achieved performance close to the proprietary models, with h<italic>F</italic><sub>1</sub> scores of 0.742 and 0.739 and Exact Match scores of 0.630 and 0.629, respectively. Both models exceeded 0.84 in topic-level accuracy (l<sub>1</sub>) and maintained strong subtopic accuracy (l<sub>2</sub> &#x2248; 0.74) with scores of 0.738 and 0.741. 
Mid-sized models such as Gemma-2-9B-IT also performed competitively (h<italic>F</italic><sub>1</sub>=0.657, Exact Match=0.544, l<sub>1</sub>=0.770, and l<sub>2</sub>=0.706), whereas smaller models such as Aya-Expanse-8B underperformed across metrics (h<italic>F</italic><sub>1</sub>=0.528, Exact Match=0.411) and struggled particularly at the subtopic level (l<sub>2</sub>=0.636). <xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates this scaling effect clearly, with larger multilingual models covering a broader area across h<italic>F</italic><sub>1</sub>, Exact Match, and both accuracy levels compared to their smaller counterparts.</p><p>Indic models showed the widest variability, with Sarvam-M emerging as a strong outlier, achieving h<italic>F</italic><sub>1</sub>=0.757 and Exact Match=0.647. Compared to GPT-5, Sarvam-M fell short by only 3.4% in h<italic>F</italic><sub>1</sub>. At the topic level, it reached an accuracy of l<sub>1</sub>=0.867, and at the subtopic level l<sub>2</sub>=0.747, only slightly below the top-performing model GPT-5 (0.886 and 0.771, respectively). This indicates that Sarvam-M is the strongest free and open-weight alternative to proprietary models, considering the restricted access and cost associated with proprietary models. Other Indic models performed substantially lower; the second-highest-performing Indic model was Llama-3-Gaja-Hindi-8B, achieving an h<italic>F</italic><sub>1</sub> of 0.596, an Exact Match of 0.452, and accuracies of 0.740 at l<sub>1</sub> and 0.610 at l<sub>2</sub>. Interestingly, it outperformed the larger Krutrim-2-Instruct (12B; OLA Krutrim Team) (h<italic>F</italic><sub>1</sub>=0.558, Exact Match=0.386, l<sub>1</sub>=0.731, and l<sub>2</sub>=0.527), while Airavata demonstrated lower performance across metrics (h<italic>F</italic><sub>1</sub>=0.404, Exact Match=0.226, l<sub>1</sub>=0.581, and l<sub>2</sub>=0.389). 
These comparisons indicate that parameter count alone does not guarantee superior performance; factors such as model design and training data quality appear to play a more decisive role. By contrast, Airavata and AryaBhatta consistently trailed across all metrics, clustering at the lowest end of performance in both topic- and subtopic-level accuracy.</p><p>The results highlight a consistent performance hierarchy across model categories. Proprietary systems dominated overall, while large open-weight multilingual models narrowed the gap across all metrics, and Indic models demonstrated both promise for local contexts and limitations. Sarvam-M stood out as a competitive alternative, while other Indic models struggled to identify user intent in code-mixed settings. The combination of <xref ref-type="table" rid="table5">Table 5</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref> underscores not only differences in overall performance but also the critical difficulty of achieving accurate subtopic-level predictions in mixed-language SRH queries.</p><p>To further examine performance differences, <xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref> present <italic>F</italic><sub>1</sub> scores across SRHQ-India topics. <xref ref-type="fig" rid="figure3">Figure 3</xref> highlights per-topic <italic>F</italic><sub>1</sub> scores alongside model-wise averages, where the reported average <italic>F</italic><sub>1</sub> is the macro-average of flat, topic-level <italic>F</italic><sub>1</sub> scores computed across all topics for each model. Proprietary model GPT-5 achieved the highest overall average <italic>F</italic><sub>1</sub> score (0.85), while Sarvam-M emerged as the strongest open-weight Indic alternative with an average of 0.82, with performance comparable to Claude-3.5-Sonnet (0.79) and other open-weight models such as Llama-3.3-70B (0.78) and Gemma-3-27B (0.79). 
Other Indic models, such as Llama-3-Gaja-Hindi-8B (CognitiveLab), showed weaker averages and wider variability, particularly in nuanced domains.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Model-wise topic-level <italic>F</italic><sub>1</sub> scores across the sexual and reproductive health queries (SRHQ)-India dataset. Each bar represents the flat <italic>F</italic><sub>1</sub> score for a specific topic, while the dotted lines indicate the macro-average of topic-level <italic>F</italic><sub>1</sub> scores across all topics for each model. Results highlight differences between proprietary, open-weight, and Indic models. HIV: human immunodeficiency virus; PCOD: polycystic ovarian disease; PCOS: polycystic ovary syndrome; PNC: postnatal care.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86545_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Topic-level <italic>F</italic><sub>1</sub> scores with 95% CIs across sexual and reproductive health queries (SRHQ)-India. Each bar represents the <italic>F</italic><sub>1</sub> score for a specific topic, with error bars indicating 95% CIs estimated via bootstrap resampling. Results highlight performance differences across proprietary, open-weight, and Indic models. HIV: human immunodeficiency virus; PCOD: polycystic ovarian disease; PCOS: polycystic ovary syndrome; PNC: postnatal care.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86545_fig04.png"/></fig><p><xref ref-type="fig" rid="figure4">Figure 4</xref> provides a closer look at topic-level differences across models. Proprietary systems maintained stability across nearly all categories, performing strongest in human immunodeficiency virus, Contraception and Family Planning, and Menstrual Health, which require precise and time-sensitive classification. 
Although human immunodeficiency virus was represented in fewer queries, performance remained consistent across models, suggesting less variability in how these queries are expressed. Sarvam-M again demonstrated competitive results, closely tracking proprietary and large multilingual models in categories such as Contraception and Family Planning, Menstrual Health, and polycystic ovary syndrome or polycystic ovarian disease. However, all models showed relative declines in Sexual and Vaginal Health, and Mental Health and Wellness, where queries frequently overlap with adjacent categories and require additional context for a clear-label decision. In these topics, diverse expressions of intent and limited sample sizes contribute to greater uncertainty, particularly for Mental Health and Wellness, where even a small number of misclassifications can substantially affect performance estimates. These findings highlight that while proprietary systems remain strongest, large multilingual open-weight models substantially close the gap, and Sarvam-M stands out as a promising Indic alternative. Yet, across all families, accurate subtopic-level predictions in code-mixed SRH queries continue to be the most persistent challenge.</p><p>To further examine where models succeed and fail, <xref ref-type="fig" rid="figure5">Figure 5</xref> presents confusion matrices for proprietary, open-weight, and Indic models, highlighting recurring misclassification patterns. Consistent weaknesses appeared in overlap-prone and context-dependent topics such as Other, Mental Health and Wellness, and Sexual and Vaginal Health, where misclassifications were frequent and often collapsed into broader categories. Proprietary systems, particularly GPT-5, demonstrated the most balanced performance across all topics, while Claude-3.5-Sonnet showed sharper drops in Other categories despite strong results elsewhere. 
Open-weight models (Gemma-3-27B and Llama-3.3-70B) performed competitively but exhibited greater instability in culturally sensitive areas, including frequent misclassification of Sexual and Vaginal Health queries into the Pregnancy and PNC or Other category. Among Indic models, Sarvam-M emerged as the strongest, closely mirroring proprietary models&#x2019; performance by correctly classifying over 90% of queries in Menstrual Health, Contraception and Family Planning, and Mental Health and Wellness. However, it still struggled in fine-grained intent recognition, with frequent misclassification of Contraception and Family Planning into the Pregnancy and PNC or Menstrual Health category. By contrast, smaller Indic systems such as Llama-3-Gaja-Hindi-8B displayed widespread confusions across sensitive categories and a notable tendency to produce &#x201C;NotValid&#x201D; predictions, where models generated labels not included in the predefined SRH hierarchy shown in the prompt. For clarity in <xref ref-type="fig" rid="figure5">Figure 5</xref>, predictions that fall outside the predefined SRH hierarchy are grouped under a &#x201C;NotValid&#x201D; category. These out-of-hierarchy outputs receive no credit in the hierarchical evaluation. These cases underscore the limited ability of smaller Indic models to consistently map inputs to the defined label structure and reflect the challenges related to limited training data and weaker alignment with the task structure.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Topic-level confusion matrices for 6 models representing proprietary, open-weight, and Indic categories. Each cell shows counts (black) and row-normalized percentages. Correct classifications appear in green diagonal cells with percentages in blue. Misclassifications appear in off-diagonal yellow cells, with percentages shown in red. &#x201C;NotValid&#x201D; denotes predictions that do not belong to any valid topic. 
HIV: human immunodeficiency virus; PCOD: polycystic ovarian disease; PCOS: polycystic ovary syndrome; PNC: postnatal care.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86545_fig05.png"/></fig></sec><sec id="s3-2"><title>Error Analysis</title><p>To better understand model behavior beyond aggregate metrics, we conducted a qualitative analysis of errors from 6 representative models (GPT-5, Claude-3.5-Sonnet, Llama-3.3-70B-Instruct, Gemma-3-27B-IT, Sarvam-M, and Llama-3-Gaja-Hindi-8B) spanning proprietary, multilingual open-weight, and Indic categories. For each model, we sampled 50 misclassified queries and performed a detailed review of their errors. <xref ref-type="table" rid="table4">Table 4</xref> presents examples of misclassification. Additionally, we included error analysis for GPT-5 and GPT-4o in Table S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, highlighting cases where GPT-4o misclassified queries that GPT-5 handled correctly.</p><sec id="s3-2-1"><title>C1: Lexical Ambiguity and Euphemisms</title><p>In <xref ref-type="table" rid="table4">Table 4</xref>, row 1, the query &#x201C;safaiya kaise karwate hain?&#x201D; translates to &#x201C;How is an abortion done?&#x201D; and should be classified as Pregnancy and PNC&#x2192;Abortion. However, only the proprietary models assigned the topic and subtopic correctly; both open-weight and Indic models misclassified the query under Sexual and Vaginal Health, Menstrual Health, or General Health Queries. As detailed in Table S4 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>, model-generated explanations indicate that these errors stem from the literal interpretation of the colloquial term &#x201C;safaiya&#x201D; as hygiene, sanitary products, or general cleaning practices. 
In practice, however, the term is commonly used to refer to the abortion evacuation procedure in the Indian context [<xref ref-type="bibr" rid="ref60">60</xref>]. This example highlights how culturally embedded euphemisms widely used in Indian contexts can mislead models if culturally specific meanings are not recognized.</p></sec><sec id="s3-2-2"><title>C2: Misclassifying Stage in the Reproductive Health Journey</title><p>As shown in <xref ref-type="table" rid="table4">Table 4</xref>, row 2, models frequently confused different pregnancy stages. The query &#x201C;Pregnancy me 5 month me vomiting hoti h to kya kre&#x201D; translates to &#x201C;What should I do if I am vomiting in the 5th month of pregnancy?&#x201D; which pertains to the antepartum stage (pregnancy period). Of the 6 best-performing models, only GPT-5, Claude-3.5-Sonnet, and Sarvam-M predicted this correctly. The remaining models classified this query under more general Pregnancy Information, thereby failing to distinguish the specific stage of the reproductive health journey.</p><p>As illustrated in Table S5 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>, these models recognized the pregnancy context but did not incorporate gestational timing into their subtopic selection. This distinction is clinically important, as symptoms such as vomiting have different implications and recommended guidance during the antepartum vs postpartum periods. 
Assigning such queries to overly broad subcategories such as &#x201C;Pregnancy Information&#x201D; dilutes the specificity of the advice that models are expected to provide and risks misalignment with clinical best practices.</p></sec><sec id="s3-2-3"><title>C3: Cultural and Religious Context Error</title><p>In <xref ref-type="table" rid="table4">Table 4</xref>, row 3, the user query &#x201C;Family planning Muslim community me accept hain kya?&#x201D; translates to &#x201C;Is family planning accepted in the Muslim community?&#x201D; and reflects a user&#x2019;s religious concern regarding the acceptability of family planning. In our evaluation, both open-weight models correctly assigned the topic and subtopic, while others&#x2014;including proprietary and Indic models&#x2014;placed the query under Cultural, Religious, or Moral Norms. Model-generated reasons (Table S6 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) indicate that these models focused on the religious framing of the questions rather than their underlying health-related intent. Similarly, in row 4, the query &#x201C;Masik pali aane se Bhagvan ke pas kyo nahi jana chahiye?&#x201D; (Why should one avoid going to temples during menstruation?) reflects a culturally and religiously grounded belief rather than a medical concern. GPT-5, Claude-3.5-Sonnet, and Sarvam-M correctly classified this query under Cultural, Religious, or Moral Norms. In contrast, 3 other models, including both open-weight and Indic models, classified it under health-related categories such as Menstrual Health or Misconceptions and Myths. 
Model-generated reasons (Table S6 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) suggest that these systems focused on biological or psychological interpretations of menstruation rather than the user&#x2019;s culturally grounded concern.</p><p>In both examples above, the alternative categorizations are semantically plausible given the cultural framing of the user queries; therefore, we treat these cases as a routing-level issue rather than a linguistic error. We later discuss how we might deal with such ambiguous cases when classifying user intent in the &#x201C;Limitations and Future Works&#x201D; subheading.</p></sec><sec id="s3-2-4"><title>C4: Misunderstanding Reproductive Intent</title><p>This category captures cases where models failed to recognize the user&#x2019;s reproductive intent, often reversing it from wanting to conceive to wanting to avoid conception. For example, in <xref ref-type="table" rid="table4">Table 4</xref>, row 5, the query &#x201C;1 sal se bacha rukhne ke liye try kar rahe hai lekin nahi rukh raha hai to kya karna padega?&#x201D; translates to &#x201C;We have been trying to have a child for one year, but it has not happened. What should we do?&#x201D; and reflects the issue of infertility, a deeply personal and emotionally sensitive topic. However, only the proprietary models classified this query correctly. Model-generated reasons from other systems (Table S7 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) indicate that the colloquial phrase &#x201C;bacha rukna,&#x201D; which in India commonly means conceiving, was interpreted literally as preventing pregnancy. 
As a result, these models prioritized Contraception and Family Planning subtopics such as Family Planning Queries and Contraceptive Effectiveness and Duration. While such interpretations are not inherently illogical given the taxonomy, they diverge from the user&#x2019;s underlying intent of seeking fertility-related guidance.</p><p>A similar misalignment occurred in row 6 with the query &#x201C;Family planning me agar koi mahila test tube karvati hai to kya hota hai?&#x201D; (&#x201C;In family planning, if a woman undergoes a test tube procedure, what happens?&#x201D;). Here, the user was referring to IVF, a fertility treatment aimed at achieving pregnancy. Yet all models categorized the query under Family Planning or Sterilization rather than Infertility. This demonstrates how culturally specific terms such as &#x201C;test tube karvati hai&#x201D; can invert user intent, classifying conception-seeking queries as if they were about contraception.</p><p>From an SRH perspective, such errors are critical, since infertility queries require guidance on fertility assessment and treatment. Misclassifying these queries risks invalidating user concerns, reinforcing stigma, as infertility in India is often associated with blame and silence, particularly for women [<xref ref-type="bibr" rid="ref61">61</xref>]. Treating these queries as matters of pregnancy avoidance could further marginalize users seeking support.</p></sec><sec id="s3-2-5"><title>C5: Misclassifying Questions About Bodily Anatomy or Processes</title><p>In <xref ref-type="table" rid="table4">Table 4</xref>, row 7, the query &#x201C;Kitni der jinda rahte hain sperm?&#x201D; translates to &#x201C;How long do sperm stay alive?&#x201D; and should be categorized under Sexual and Vaginal Health, subtopic Reproductive Anatomy. Only 1 model (Claude-3.5-Sonnet) correctly identified both the topic and subtopic. 
Other models routed the query to categories such as Sexual and Vaginal Health, Contraception and Family Planning, Pregnancy and PNC, or General Health Queries (Table S8 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Model-generated reasons indicate that many systems overassociate the term sperm with sexual behavior, contraception, or pregnancy risk rather than recognizing it as a biological question. Such routing decisions are consequential, as users seeking factual information about sperm viability may instead receive guidance about sexual behavior or contraception, reducing clarity and potentially misdirecting them away from accurate, evidence-based biological information.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our evaluation shows that both model family and training data alignment play an important role in hierarchical SRH intent classification, alongside model scale. Proprietary systems still lead, but the gap narrows when models are trained on culturally aligned and domain-relevant data. Sarvam-M, an Indic-first model, emerged as the strongest Indic system and achieved performance comparable to top-performing proprietary models and the best large open-weight baselines such as Llama-3.3-70B-Instruct and Gemma-3-27B-IT. It is also important to consider that Sarvam-M is larger than the other Indic models evaluated, and its performance likely reflects a combination of increased model capacity and Indic-focused training data. While these results do not fully disentangle the effects of scale and data quality, they suggest that culturally aligned resources, rather than parameter count, are the primary drivers of performance on code-mixed intent classification.</p><p>Across the 6 best-performing models, we observed a consistent pattern of subtopic misclassification. 
In Pregnancy and PNC, antepartum and postpartum concerns are frequently conflated, and abortion-related queries are sometimes assigned to the &#x201C;Pregnancy Information&#x201D; subcategory. In &#x201C;Sexual and Vaginal Health,&#x201D; euphemisms such as &#x201C;safaiya or safai&#x201D; (colloquial for abortion) were misinterpreted literally as &#x201C;cleaning.&#x201D; Similarly, culturally embedded phrases such as &#x201C;bacha rukna&#x201D; (conceiving) are sometimes misinterpreted as preventing pregnancy, and &#x201C;test tube karvati hai&#x201D; (referring to IVF) is sometimes treated as sterilization or contraception [<xref ref-type="bibr" rid="ref62">62</xref>]. This reflects how health communication in rural, low-literacy settings often relies on indirect, gendered, and culturally mediated channels [<xref ref-type="bibr" rid="ref63">63</xref>]. Accounting for these cultural expressions is essential to improve classification accuracy and ensure that clinically relevant questions are not overlooked. We also observed that SRH queries could be misrouted as Cultural, Religious, or Moral Norms, even when the underlying intent is to seek health guidance or clinical information. A user query could also plausibly fit into multiple other categories, as we observed. Future work may explore culturally informed prompt design and lightweight in-context examples that expose models to common SRH euphemisms and colloquial expressions. 
Additionally, to address overlapping topics, researchers could explore alternative approaches that account for such ambiguity, such as ranked classification [<xref ref-type="bibr" rid="ref64">64</xref>].</p><p>In the &#x201C;Other&#x201D; category, even strong models such as Llama-3-Gaja-Hindi-8B and Gemma-3-27B-IT frequently assigned queries to new topics outside the defined hierarchy; we treat such misclassifications as &#x201C;Not Valid&#x201D; predictions (<xref ref-type="fig" rid="figure5">Figure 5</xref>). These cases arise when models generate unintended or unsupported labels that are not part of the hierarchical classification structure, reflecting models&#x2019; tendency toward hallucinations [<xref ref-type="bibr" rid="ref65">65</xref>] or schema misalignment [<xref ref-type="bibr" rid="ref66">66</xref>]. Our hierarchical framework requires a correct topic prediction before the subtopic evaluation. Accordingly, &#x201C;Not Valid&#x201D; predictions are treated as complete classification failures and receive no credit in the hierarchical <italic>F</italic><sub>1</sub> score. Overall, these patterns indicate that models struggle to reliably map culturally and contextually grounded queries to the specified hierarchy. This limitation is especially critical in SRH contexts, where errors at the topic level prevent meaningful subtopic interpretation and risk misrouting sensitive queries. High precision at the fine-grained intent level is paramount to ensure safe guidance and avoid reinforcing stigma.</p><p>When compared to prior chatbot efforts, our findings highlight the importance of a hierarchical classification approach. SnehAI [<xref ref-type="bibr" rid="ref15">15</xref>], an SRH-focused conversational agent deployed in India, demonstrated feasibility but relied on rule-based methods, which restrict its ability to process open-ended, code-mixed, or culturally nuanced queries. 
Health-Pariksha [<xref ref-type="bibr" rid="ref38">38</xref>], while not designed specifically for SRH, evaluated several LLMs on real-world health care queries, measuring the factual correctness, semantic similarity, coherence, and conciseness of the model responses. Although this approach contributed to improving trustworthiness, it did not address the challenges of intent recognition or hierarchical classification. In contrast, our framework directly targets the layered structure of SRH queries, enabling distinction between broad categories and their fine-grained subcategories. This supports classifications that are both clinically appropriate and culturally relevant. Through our evaluation, proprietary models emerged as the most reliable for hierarchical classification, but notably, Sarvam-M emerged as the second-highest performing model overall&#x2014;outscoring Claude-3.5-Sonnet and closely matching Llama-3.3-70B-Instruct. Given that Sarvam-M is open weight and free to use, it represents a promising alternative to proprietary systems, showing that with robust data and cultural adaptation, Indic models can deliver competitive performance.</p></sec><sec id="s4-2"><title>Limitations and Future Work</title><p>This study has several limitations. First, we focused on Hinglish as an illustrative case of code-mixed SRH queries, while many other languages and code-mixing patterns are used in India. Second, the hierarchical intent schema was developed for our setting and user population; researchers working in other regions should adapt label definitions to their context and target users. Third, we evaluated more models than are reported here; several were excluded from the final set presented in this paper, either due to poor performance on code-mixed data or the cost of highly parameterized models. 
Fourth, our dataset is limited in size; certain topics and subtopics are underrepresented, resulting in lower statistical power for these categories, and performance estimates for these topics should be interpreted as preliminary. Fifth, our evaluation was conducted in a strict zero-shot setting, where models were not provided with the detailed subtopic definitions used by human annotators. This design allows us to assess how models infer intent from real-world, code-mixed queries without semantic scaffolding. Finally, our evaluation treats intent as a single topic, even though some queries plausibly map to more than 1 topic, motivating the exploration of alternative modeling approaches that can better capture such ambiguity.</p><p>In the future, we plan to extend our work to support additional Indian languages and code-mixing patterns, enriching both the breadth of topics and the depth of subtopic coverage in collaboration with the Myna Mahila Foundation. We also plan to evaluate in-context prompting strategies and to explore ranked classification [<xref ref-type="bibr" rid="ref67">67</xref>] and hierarchical selective classification [<xref ref-type="bibr" rid="ref64">64</xref>] approaches, which allow models to better account for ambiguity in hierarchical intent prediction.</p></sec><sec id="s4-3"><title>Implications</title><p>This study highlights the importance of hierarchical classification for SRH applications in low-resource settings. By capturing intent at both the topic and subtopic levels, this framework improves contextual precision, supports safer dialogue systems, and enables public health organizations to design targeted interventions. At the same time, our findings show that LLMs often struggle with euphemisms and sociocultural framing, highlighting the need for systems that are socially attuned as well as technically accurate. 
Our evaluation offers a starting benchmark that can guide the development of culturally aligned, open-source LLM tools for low-resource health care contexts. More broadly, this work contributes to the discourse on health equity in AI by addressing the linguistic and cultural barriers that limit access to reliable health information for marginalized populations.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In this study, we evaluated the zero-shot performance of proprietary, open-weight multilingual, and instruction-tuned Indic LLMs on the hierarchical classification of SRH queries. Our findings reveal that the proprietary model (GPT-5) was the most reliable, while Sarvam-M emerged as the strongest Indic model and performed competitively with Claude-3.5-Sonnet and large open-weight models such as Llama-3.3-70B-Instruct. This highlights the importance of culturally aligned and domain-relevant training data alongside model scale. Our error analysis further revealed that models frequently misclassified queries that involved euphemisms, cultural or religious language, or time-sensitive concerns, underscoring the importance of capturing nuance at both topic and subtopic levels. Misclassifications in these areas risk unsafe guidance and reinforcing stigma, particularly around sensitive issues such as abortion and infertility. 
By introducing a benchmark evaluation framework, this study supports the development of open-source multilingual models for SRH and advances culturally aligned, socially responsive AI systems to better serve the health information needs of underserved communities.</p><p>The source code used in this study, including scripts for model evaluation, hierarchical metric calculation (h<italic>F</italic><sub>1</sub>), bootstrap resampling, and CMI calculation, is publicly available at a GitHub repository [<xref ref-type="bibr" rid="ref68">68</xref>] to support future research.</p></sec></sec></body><back><ack><p>We sincerely thank the Myna Mahila Foundation for their invaluable support throughout this study. We would also like to thank the community members who were engaged in the data collection process with the Myna Mahila Foundation, and the editor and reviewers for their thoughtful and detailed comments.</p></ack><notes><sec><title>Funding</title><p>This research was made possible with the support of the Google Award for Inclusion Research and the Agency Fund through Myna Mahila USA (SA24-03-03).</p></sec><sec><title>Data Availability</title><p>The sexual and reproductive health queries (SRHQ)-India dataset introduced in this study contains sensitive queries related to sexual and reproductive health, expressed in code-mixed Hinglish. To ensure privacy and ethical use, the dataset will not be publicly released in full. However, deidentified subsets or access to the full dataset can be provided upon reasonable request to the corresponding author for academic and noncommercial research purposes, subject to data use agreements and ethical review.</p></sec></notes><fn-group><fn fn-type="con"><p>SKD and MS conceptualized the study and designed the data framework. AT and MS performed data annotation. SKD carried out the formal analysis, developed the methodology, and created visualizations. ZM, SJ, TD, and MS provided resources for the study. 
AI and SJ supervised the research. The original draft was prepared by SKD, and all authors contributed to reviewing and editing the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CMI</term><def><p>Code-Mixing Index</p></def></def-item><def-item><term id="abb3">IVF</term><def><p>in vitro fertilization</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">PNC</term><def><p>postnatal care</p></def></def-item><def-item><term id="abb6">SRH</term><def><p>sexual and reproductive health</p></def></def-item><def-item><term id="abb7">SRHQ</term><def><p>sexual and reproductive health queries</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shahsavar</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>A</given-names> </name></person-group><article-title>User intentions to use ChatGPT for self-diagnosis and health-related purposes: cross-sectional survey study</article-title><source>JMIR Hum Factors</source><year>2023</year><month>05</month><day>17</day><volume>10</volume><fpage>e47564</fpage><pub-id pub-id-type="doi">10.2196/47564</pub-id><pub-id pub-id-type="medline">37195756</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>E</surname><given-names>S</given-names> </name><name name-style="western"><surname>AS</surname><given-names>D</given-names> </name><name name-style="western"><surname>H</surname><given-names>TM</given-names> </name><name name-style="western"><surname>R</surname><given-names>K</given-names> </name><name name-style="western"><surname>E</surname><given-names>S</given-names> </name></person-group><article-title>KnowComp at SemEval-2023 task 7: fine-tuning pre-trained language models for clinical trial entailment identification</article-title><access-date>2026-03-11</access-date><conf-name>Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023)</conf-name><conf-date>Jul 13-14, 2023</conf-date><conf-loc>Toronto, Canada</conf-loc><fpage>1</fpage><lpage>9</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.semeval-1">https://aclanthology.org/2023.semeval-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.semeval-1.1</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Verma</surname><given-names>G</given-names> </name><name name-style="western"><surname>Mujumdar</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>ZJ</given-names> </name><name name-style="western"><surname>De Choudhury</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name></person-group><article-title>Overcoming language disparity in online content classification with multimodal 
learning</article-title><year>2022</year><conf-name>Proceedings of the International AAAI Conference on Web and Social Media</conf-name><conf-date>Jun 6-9, 2022</conf-date><conf-loc>Atlanta, GA</conf-loc><fpage>1040</fpage><lpage>1051</lpage><pub-id pub-id-type="doi">10.1609/icwsm.v16i1.19356</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Touvron</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lavril</surname><given-names>T</given-names> </name><name name-style="western"><surname>Izacard</surname><given-names>G</given-names> </name><etal/></person-group><article-title>LLaMA: open and efficient foundation language models</article-title><source>arXiv</source><comment>Preprint posted online on Feb 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.13971</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Mar 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Das</surname><given-names>M</given-names> </name><name name-style="western"><surname>Angeli</surname><given-names>F</given-names> </name><name name-style="western"><surname>Krumeich</surname><given-names>AJSM</given-names> </name><name name-style="western"><surname>van Schayck</surname><given-names>OCP</given-names> </name></person-group><article-title>The gendered 
experience with respect to health-seeking behaviour in an urban slum of Kolkata, India</article-title><source>Int J Equity Health</source><year>2018</year><month>02</month><day>14</day><volume>17</volume><issue>1</issue><fpage>24</fpage><pub-id pub-id-type="doi">10.1186/s12939-018-0738-8</pub-id><pub-id pub-id-type="medline">29444674</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Deva</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ramani</surname><given-names>D</given-names> </name><name name-style="western"><surname>Divate</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jalota</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ismail</surname><given-names>A</given-names> </name></person-group><article-title>&#x201C;Kya family planning after marriage hoti hai?&#x201D;: integrating cultural sensitivity in an LLM chatbot for reproductive health</article-title><conf-name>CHI &#x2019;25: Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>Apr 26 to May 1, 2025</conf-date><conf-loc>Yokohama, Japan</conf-loc><fpage>1</fpage><lpage>23</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3706598">https://dl.acm.org/doi/proceedings/10.1145/3706598</ext-link></comment><pub-id pub-id-type="doi">10.1145/3706598.3713362</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Soffer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agbareia</surname><given-names>R</given-names> 
</name><etal/></person-group><article-title>Sociodemographic biases in medical decision making by large language models</article-title><source>Nat Med</source><year>2025</year><month>06</month><volume>31</volume><issue>6</issue><fpage>1873</fpage><lpage>1881</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-03626-6</pub-id><pub-id pub-id-type="medline">40195448</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Purnat</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Wilhelm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Scales</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Impacts of sexual and reproductive health and rights misinformation in digital spaces on human rights protection and promotion: scoping review</article-title><source>JMIR Infodemiology</source><year>2025</year><month>12</month><day>30</day><volume>5</volume><fpage>e83747</fpage><pub-id pub-id-type="doi">10.2196/83747</pub-id><pub-id pub-id-type="medline">41468582</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>John</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Gorman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Scales</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gorman</surname><given-names>J</given-names> </name></person-group><article-title>Online misleading information about women&#x2019;s reproductive health: a narrative review</article-title><source>J Gen Intern Med</source><year>2025</year><month>04</month><volume>40</volume><issue>5</issue><fpage>1123</fpage><lpage>1131</lpage><pub-id 
pub-id-type="doi">10.1007/s11606-024-09118-6</pub-id><pub-id pub-id-type="medline">39511120</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shekhar</surname><given-names>C</given-names> </name><name name-style="western"><surname>Acharya</surname><given-names>R</given-names> </name><etal/></person-group><article-title>The incidence of abortion and unintended pregnancy in India, 2015</article-title><source>Lancet Glob Health</source><year>2018</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e111</fpage><lpage>e120</lpage><pub-id pub-id-type="doi">10.1016/S2214-109X(17)30453-9</pub-id><pub-id pub-id-type="medline">29241602</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Garg</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Misra</surname><given-names>A</given-names> </name><name name-style="western"><surname>Seth</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chakraborty</surname><given-names>T</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Larson</surname><given-names>K</given-names> </name></person-group><article-title>SUKHSANDESH: an avatar therapeutic question answering platform for sexual education in rural India</article-title><conf-name>Proceedings of the Thirty-Third International Joint Conference on Artificial Intelligence (IJCAI-24)</conf-name><conf-date>Aug 3-9, 2024</conf-date><conf-loc>Jeju, South Korea</conf-loc><fpage>7465</fpage><lpage>7473</lpage><pub-id 
pub-id-type="doi">10.24963/ijcai.2024/826</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>D</given-names> </name></person-group><article-title>Complications during pregnancy among Indian tribal women: a mini-review</article-title><source>JMMS</source><year>2025</year><volume>2</volume><issue>2</issue><fpage>128</fpage><lpage>131</lpage><pub-id pub-id-type="doi">10.51219/JMMS/Kumar-D/25</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joyce</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mukherji</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nandi</surname><given-names>A</given-names> </name></person-group><article-title>Socioeconomic inequalities in adverse pregnancy outcomes in India: 2004-2019</article-title><source>PLOS Glob Public Health</source><year>2024</year><volume>4</volume><issue>9</issue><fpage>e0003701</fpage><pub-id pub-id-type="doi">10.1371/journal.pgph.0003701</pub-id><pub-id pub-id-type="medline">39292712</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singhal</surname><given-names>A</given-names> </name><etal/></person-group><article-title>An 
artificial intelligence chatbot for young people&#x2019;s sexual and reproductive health in India (SnehAI): instrumental case study</article-title><source>J Med Internet Res</source><year>2022</year><month>01</month><day>3</day><volume>24</volume><issue>1</issue><fpage>e29969</fpage><pub-id pub-id-type="doi">10.2196/29969</pub-id><pub-id pub-id-type="medline">34982034</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Patwa</surname><given-names>P</given-names> </name><name name-style="western"><surname>Aguilar</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kar</surname><given-names>S</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Herbelot</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Palmer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>N</given-names> </name><name name-style="western"><surname>May</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shutova</surname><given-names>E</given-names> </name></person-group><article-title>SemEval-2020 task 9: overview of sentiment analysis of code-mixed tweets</article-title><conf-name>Proceedings of the Fourteenth Workshop on Semantic Evaluation</conf-name><conf-date>Dec 12-13, 2020</conf-date><conf-loc>Barcelona (online)</conf-loc><fpage>774</fpage><lpage>790</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.semeval-1">https://aclanthology.org/2020.semeval-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.semeval-1.100</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Parshad</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Bhowmick</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chand</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kumari</surname><given-names>N</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>N</given-names> </name></person-group><article-title>What is India speaking? Exploring the &#x201C;Hinglish&#x201D; invasion</article-title><source>Phys A: Stat Mech Appl</source><year>2016</year><month>05</month><volume>449</volume><fpage>375</fpage><lpage>389</lpage><pub-id pub-id-type="doi">10.1016/j.physa.2016.01.015</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sengupta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Das</surname><given-names>S</given-names> </name><name name-style="western"><surname>Akhtar</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Chakraborty</surname><given-names>T</given-names> </name></person-group><article-title>Social, economic, and demographic factors drive the emergence of Hinglish code-mixing on social media</article-title><source>Humanit Soc Sci Commun</source><year>2024</year><volume>11</volume><issue>1</issue><fpage>606</fpage><pub-id pub-id-type="doi">10.1057/s41599-024-03058-6</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Heredia</surname><given-names>M</given-names> </name><name name-style="western"><surname>Labaka</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Barnes</surname><given-names>J</given-names> </name><name name-style="western"><surname>Soroa</surname><given-names>A</given-names> </name></person-group><article-title>Conditioning LLMs to generate code-switched text</article-title><source>arXiv</source><comment>Preprint posted online on Feb 18, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.12924</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yong</surname><given-names>ZX</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Forde</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Prompting multilingual large language models to generate code-mixed texts: the case of South East Asian languages</article-title><access-date>2026-02-02</access-date><conf-name>Proceedings of the 6th Workshop on Computational Approaches to Linguistic Code-Switching</conf-name><conf-date>Dec 6-10, 2023</conf-date><conf-loc>Singapore</conf-loc><fpage>43</fpage><lpage>63</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.calcs-1">https://aclanthology.org/2023.calcs-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.calcs-1.5</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moskovitch</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cohen-Kashi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dror</surname><given-names>U</given-names> </name><name name-style="western"><surname>Levy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Maimon</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Shahar</surname><given-names>Y</given-names> </name></person-group><article-title>Multiple hierarchical classification of free-text clinical guidelines</article-title><source>Artif Intell Med</source><year>2006</year><month>07</month><volume>37</volume><issue>3</issue><fpage>177</fpage><lpage>190</lpage><pub-id pub-id-type="doi">10.1016/j.artmed.2006.04.001</pub-id><pub-id pub-id-type="medline">16730962</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>EP</given-names> </name></person-group><article-title>Hierarchical text classification and evaluation</article-title><conf-name>2001 IEEE International Conference on Data Mining</conf-name><conf-date>Nov 29 to Dec 2, 2001</conf-date><conf-loc>San Jose, CA</conf-loc><fpage>521</fpage><lpage>528</lpage><pub-id pub-id-type="doi">10.1109/ICDM.2001.989560</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kosmopoulos</surname><given-names>A</given-names> </name><name name-style="western"><surname>Partalas</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gaussier</surname><given-names>E</given-names> </name><name name-style="western"><surname>Paliouras</surname><given-names>G</given-names> </name><name name-style="western"><surname>Androutsopoulos</surname><given-names>I</given-names> </name></person-group><article-title>Evaluation measures for hierarchical classification: a unified view and novel approaches</article-title><source>Data Min Knowl Discov</source><year>2015</year><month>05</month><volume>29</volume><issue>3</issue><fpage>820</fpage><lpage>865</lpage><pub-id pub-id-type="doi">10.1007/s10618-014-0382-x</pub-id></nlm-citation></ref><ref 
id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>B</given-names> </name><name name-style="western"><surname>Nag</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Hierarchical query classification in e-commerce search</article-title><access-date>2026-03-11</access-date><conf-name>WWW &#x2019;24: Companion Proceedings of the ACM Web Conference 2024</conf-name><conf-date>May 13-17, 2024</conf-date><conf-loc>Singapore</conf-loc><fpage>338</fpage><lpage>345</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3589335">https://dl.acm.org/doi/proceedings/10.1145/3589335</ext-link></comment><pub-id pub-id-type="doi">10.1145/3589335.3648332</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Cao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Context-aware query classification</article-title><access-date>2026-03-11</access-date><conf-name>SIGIR &#x2019;09: Proceedings of the 32nd international ACM SIGIR conference on Research and development in information retrieval</conf-name><conf-date>Jul 19-23, 2009</conf-date><conf-loc>Boston, MA</conf-loc><fpage>3</fpage><lpage>10</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/1571941">https://dl.acm.org/doi/proceedings/10.1145/1571941</ext-link></comment><pub-id pub-id-type="doi">10.1145/1571941.1571945</pub-id></nlm-citation></ref><ref 
id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ahuja</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>W</given-names> </name><name name-style="western"><surname>Reddy</surname><given-names>CK</given-names> </name></person-group><article-title>A hierarchical attention retrieval model for healthcare question answering</article-title><access-date>2026-03-11</access-date><conf-name>WWW &#x2019;19: The World Wide Web Conference</conf-name><conf-date>May 13-17, 2019</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>2472</fpage><lpage>2482</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3308558">https://dl.acm.org/doi/proceedings/10.1145/3308558</ext-link></comment><pub-id pub-id-type="doi">10.1145/3308558.3313699</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Denecke</surname><given-names>K</given-names> </name></person-group><article-title>Classification of user queries according to a hierarchical medical procedure encoding system using an ensemble classifier</article-title><source>Front Artif Intell</source><year>2022</year><volume>5</volume><fpage>1000283</fpage><pub-id pub-id-type="doi">10.3389/frai.2022.1000283</pub-id><pub-id pub-id-type="medline">36406473</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sosnowski</surname><given-names>&#x0141;</given-names> </name><name 
name-style="western"><surname>&#x017B;u&#x0142;awi&#x0144;ska</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dutta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Szymusik</surname><given-names>I</given-names> </name><name name-style="western"><surname>Zygu&#x0142;a</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bambul-Mazurek</surname><given-names>E</given-names> </name></person-group><article-title>Artificial intelligence in personalized healthcare analysis for womens&#x2019; menstrual health disorders</article-title><conf-name>2022 17th Conference on Computer Science and Intelligence Systems (FedCSIS)</conf-name><conf-date>Sep 4-7, 2022</conf-date><conf-loc>Sofia, Bulgaria</conf-loc><fpage>751</fpage><lpage>760</lpage><pub-id pub-id-type="doi">10.15439/2022F59</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bouadjenek</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Naseem</surname><given-names>U</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Rambow</surname><given-names>O</given-names> </name><name name-style="western"><surname>Wanner</surname><given-names>L</given-names> </name><name name-style="western"><surname>Apidianaki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Al-Khalifa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Eugenio</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Schockaert</surname><given-names>S</given-names> </name></person-group><article-title>Leveraging taxonomy and LLMs for improved multimodal hierarchical 
classification</article-title><access-date>2025-09-14</access-date><conf-name>Proceedings of the 31st International Conference on Computational Linguistics Association for Computational Linguistics</conf-name><conf-date>Jan 19-24, 2025</conf-date><conf-loc>Abu Dhabi, UAE</conf-loc><fpage>6244</fpage><lpage>6254</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.coling-main.417/">https://aclanthology.org/2025.coling-main.417/</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>GPT-5 system card</article-title><source>OpenAI</source><access-date>2025-09-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/gpt-5-system-card.pdf">https://cdn.openai.com/gpt-5-system-card.pdf</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Hurst</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> </name><etal/></person-group><article-title>GPT-4o system card</article-title><source>arXiv</source><comment>Preprint posted online on Oct 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.21276</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Grattafiori</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on Jul 31, 2024</comment><pub-id 
pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kamath</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ferret</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Gemma 3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Mar 25, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.19786</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ni</surname><given-names>C</given-names> </name><etal/></person-group><article-title>A systematic review of ChatGPT and other conversational large language models in healthcare</article-title><source>medRxiv</source><comment>Preprint posted online on Apr 27, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.04.26.24306390</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>W</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Oh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Naumann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Globerson</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Saenko</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hardt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Levine</surname><given-names>S</given-names> </name></person-group><article-title>Amazon-M2: a multilingual multi-locale shopping session dataset for recommendation and text generation</article-title><access-date>2026-03-11</access-date><conf-name>NIPS &#x2019;23: Proceedings of the 37th International Conference on Neural Information Processing Systems</conf-name><conf-date>Dec 10-16, 2023</conf-date><conf-loc>New Orleans, LA</conf-loc><fpage>8006</fpage><lpage>8026</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2023/file/193df57a2366d032fb18dcac0698d09a-Paper-Datasets_and_Benchmarks.pdf">https://proceedings.neurips.cc/paper_files/paper/2023/file/193df57a2366d032fb18dcac0698d09a-Paper-Datasets_and_Benchmarks.pdf</ext-link></comment><pub-id pub-id-type="doi">10.52202/075280-0351</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chandra</surname><given-names>M</given-names> </name><name name-style="western"><surname>Verma</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>De Choudhury</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name></person-group><article-title>Better to ask in English: cross-lingual evaluation of large language models for healthcare queries</article-title><access-date>2026-03-11</access-date><conf-name>WWW &#x2019;24: Proceedings of the ACM Web Conference 2024</conf-name><conf-date>May 
13-17, 2024</conf-date><conf-loc>Singapore</conf-loc><fpage>2627</fpage><lpage>2638</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/proceedings/10.1145/3589334">https://dl.acm.org/doi/proceedings/10.1145/3589334</ext-link></comment><pub-id pub-id-type="doi">10.1145/3589334.3645643</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tripathi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Sekher</surname><given-names>TV</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Murthy</surname><given-names>AK</given-names> </name></person-group><article-title>Youth in India ready for sex education? Emerging evidence from national surveys</article-title><source>PLoS ONE</source><year>2013</year><volume>8</volume><issue>8</issue><fpage>e71584</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0071584</pub-id><pub-id pub-id-type="medline">23951197</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gumma</surname><given-names>V</given-names> </name><name name-style="western"><surname>Raghunath</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jain</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sitaram</surname><given-names>S</given-names> </name></person-group><article-title>HEALTH-PARIKSHA: assessing RAG models for HEALTH chatbots in real-world multilingual settings</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.13671</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>HealthPrompt: a zero-shot learning paradigm for clinical natural language processing</article-title><source>AMIA Annu Symp Proc</source><year>2022</year><volume>2022</volume><fpage>972</fpage><lpage>981</lpage><pub-id pub-id-type="medline">37128372</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ovadje</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Garadi</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Sarker</surname><given-names>A</given-names> </name></person-group><article-title>Evaluating large language models for health-related text classification tasks with public social media data</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>10</month><day>1</day><volume>31</volume><issue>10</issue><fpage>2181</fpage><lpage>2189</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae210</pub-id><pub-id pub-id-type="medline">39121174</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lefever</surname><given-names>E</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Solorio</surname><given-names>T</given-names> </name><name name-style="western"><surname>Choudhury</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Bali</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sitaram</surname><given-names>S</given-names> </name><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Diab</surname><given-names>M</given-names> </name></person-group><article-title>Sentiment analysis for Hinglish code-mixed tweets by means of cross-lingual word embeddings</article-title><access-date>2025-12-17</access-date><conf-name>Proceedings of the 4th Workshop on Computational Approaches to Code Switching European Language Resources Association</conf-name><conf-date>May 11-16, 2020</conf-date><conf-loc>Marseille, France</conf-loc><fpage>45</fpage><lpage>51</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.calcs-1.6/">https://aclanthology.org/2020.calcs-1.6/</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chanda</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mishra</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pal</surname><given-names>S</given-names> </name></person-group><article-title>Sentiment analysis of code-mixed Dravidian languages leveraging pretrained model and word-level language tag</article-title><source>Nat Lang Process</source><year>2025</year><month>03</month><volume>31</volume><issue>2</issue><fpage>477</fpage><lpage>499</lpage><pub-id pub-id-type="doi">10.1017/nlp.2024.30</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gamb&#x00E4;ck</surname><given-names>B</given-names> </name><name name-style="western"><surname>Das</surname><given-names>A</given-names> 
</name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Calzolari</surname><given-names>N</given-names> </name><name name-style="western"><surname>Choukri</surname><given-names>K</given-names> </name><name name-style="western"><surname>Declerck</surname><given-names>T</given-names> </name></person-group><article-title>Comparing the level of code-switching in corpora</article-title><conf-name>Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC&#x2019;16) European Language Resources Association (ELRA)</conf-name><conf-date>May 23-28, 2016</conf-date><conf-loc>Portoro&#x017E;, Slovenia</conf-loc><fpage>1850</fpage><lpage>1855</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/L16-1292/">https://aclanthology.org/L16-1292/</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Joulin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Grave</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bojanowski</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name></person-group><article-title>Bag of tricks for efficient text classification</article-title><access-date>2025-02-02</access-date><conf-name>Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers</conf-name><conf-date>Apr 3-7, 2017</conf-date><conf-loc>Valencia, Spain</conf-loc><fpage>427</fpage><lpage>431</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://aclweb.org/anthology/E17-2">http://aclweb.org/anthology/E17-2</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/E17-2068</pub-id></nlm-citation></ref><ref 
id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ba&#x00F1;&#x00F3;n</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ram&#x00ED;rez-S&#x00E1;nchez</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zaragoza-Bernabeu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ortiz Rojas</surname><given-names>S</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Calzolari</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kan</surname><given-names>MY</given-names> </name><name name-style="western"><surname>Hoste</surname><given-names>V</given-names> </name><name name-style="western"><surname>Lenci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sakti</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xue</surname><given-names>N</given-names> </name></person-group><article-title>FastSpell: the LangId magic spell</article-title><access-date>2026-03-11</access-date><conf-name>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024) ELRA and ICCL</conf-name><conf-date>May 20-25, 2024</conf-date><conf-loc>Torino, Italia</conf-loc><fpage>7133</fpage><lpage>7140</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.lrec-main.626/">https://aclanthology.org/2024.lrec-main.626/</ext-link></comment><pub-id pub-id-type="doi">10.63317/3w889497us8n</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ibrahim</surname><given-names>M</given-names> </name></person-group><person-group person-group-type="editor"><name 
name-style="western"><surname>Sarveswaran</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vaidya</surname><given-names>A</given-names> </name><name name-style="western"><surname>Krishna Bal</surname><given-names>B</given-names> </name><name name-style="western"><surname>Shams</surname><given-names>S</given-names> </name><name name-style="western"><surname>Thapa</surname><given-names>S</given-names> </name></person-group><article-title>CUFE@NLU of Devanagari script languages 2025: language identification using fastText</article-title><access-date>2026-03-11</access-date><conf-name>Proceedings of the First Workshop on Challenges in Processing South Asian Languages</conf-name><conf-date>Jan 19-24, 2025</conf-date><conf-loc>Abu Dhabi, UAE</conf-loc><fpage>273</fpage><lpage>277</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.chipsal-1.30/">https://aclanthology.org/2025.chipsal-1.30/</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Viera</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Garrett</surname><given-names>JM</given-names> </name></person-group><article-title>Understanding interobserver agreement: the kappa statistic</article-title><source>Fam 
Med</source><year>2005</year><month>05</month><volume>37</volume><issue>5</issue><fpage>360</fpage><lpage>363</lpage><pub-id pub-id-type="medline">15883903</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>AQ</given-names> </name><name name-style="western"><surname>Sablayrolles</surname><given-names>A</given-names> </name><name name-style="western"><surname>Roux</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Mixtral of experts</article-title><source>arXiv</source><comment>Preprint posted online on Jan 8, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.04088</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Riviere</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Gemma 2: improving open language models at a practical size</article-title><source>arXiv</source><comment>Preprint posted online on Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2408.00118</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Qwen</collab><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen2.5 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Dec 19, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.15115</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Dang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name><name name-style="western"><surname>D&#x2019;souza</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Aya expanse: combining research breakthroughs for a new multilingual frontier</article-title><source>arXiv</source><comment>Preprint posted online on Dec 5, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.04261</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Gala</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jayakumar</surname><given-names>T</given-names> </name><name name-style="western"><surname>Husain</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Airavata: introducing Hindi instruction-tuned LLM</article-title><source>arXiv</source><comment>Preprint posted online on Jan 26, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.15006</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="web"><article-title>GenVRadmin/AryaBhatta-GemmaGenZ-Vikas-Merged</article-title><source>Hugging Face</source><access-date>2026-03-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://huggingface.co/GenVRadmin/AryaBhatta-GemmaGenZ-Vikas-Merged">https://huggingface.co/GenVRadmin/AryaBhatta-GemmaGenZ-Vikas-Merged</ext-link></comment></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="web"><article-title>Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1</article-title><source>Hugging Face</source><access-date>2025-10-27</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://huggingface.co/Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1">https://huggingface.co/Cognitive-Lab/LLama3-Gaja-Hindi-8B-v0.1</ext-link></comment></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kallappa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kamble</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ravi</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Krutrim LLM: multilingual foundational model for over a billion people</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 10, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.09642</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="web"><article-title>Claude 3.5 Sonnet</article-title><source>Anthropic</source><access-date>2025-09-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/claude-3-5-sonnet">https://www.anthropic.com/news/claude-3-5-sonnet</ext-link></comment></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="web"><article-title>Sarvam-M</article-title><source>Sarvam</source><access-date>2025-09-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.sarvam.ai/blogs/sarvam-m">https://www.sarvam.ai/blogs/sarvam-m</ext-link></comment></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Xie</surname><given-names>R</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Al-Onaizan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bansal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>YN</given-names> </name></person-group><article-title>Unveiling the lexical sensitivity of LLMs: combinatorial optimization for prompt enhancement</article-title><access-date>2026-03-11</access-date><conf-name>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 12-16, 2024</conf-date><conf-loc>Miami, FL</conf-loc><fpage>5128</fpage><lpage>5154</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2024.emnlp-main">https://aclanthology.org/2024.emnlp-main</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2024.emnlp-main.295</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhattacharya</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bashar</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name></person-group><article-title>So near, yet so far: access to safe abortion services remains elusive for poor women in India</article-title><source>BMJ Case Rep</source><year>2017</year><month>10</month><day>13</day><volume>2017</volume><fpage>bcr2017220980</fpage><pub-id pub-id-type="doi">10.1136/bcr-2017-220980</pub-id><pub-id pub-id-type="medline">29030364</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roberts</surname><given-names>L</given-names> 
</name><name name-style="western"><surname>Renati</surname><given-names>S</given-names> </name><name name-style="western"><surname>Solomon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Montgomery</surname><given-names>S</given-names> </name></person-group><article-title>Women and infertility in a pronatalist culture: mental health in the slums of Mumbai</article-title><source>Int J Womens Health</source><year>2020</year><volume>12</volume><fpage>993</fpage><lpage>1003</lpage><pub-id pub-id-type="doi">10.2147/IJWH.S273149</pub-id><pub-id pub-id-type="medline">33192102</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Unisa</surname><given-names>S</given-names> </name></person-group><article-title>Addressing reproductive health knowledge, infertility and coping strategies among rural women in India</article-title><source>J Biosoc Sci</source><year>2021</year><month>07</month><volume>53</volume><issue>4</issue><fpage>557</fpage><lpage>565</lpage><pub-id pub-id-type="doi">10.1017/S0021932020000371</pub-id><pub-id pub-id-type="medline">32677598</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Husain</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Dutta</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ghosh</surname><given-names>S</given-names> </name></person-group><source>Contraceptive use among illiterate women in India: does proximate illiteracy matter?</source><year>2011</year><access-date>2025-10-16</access-date><publisher-name>MPRA Pap</publisher-name><comment><ext-link ext-link-type="uri" 
xlink:href="https://ideas.repec.org/p/pra/mprapa/30790.html">https://ideas.repec.org/p/pra/mprapa/30790.html</ext-link></comment></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>El-Yaniv</surname><given-names>R</given-names> </name><name name-style="western"><surname>Galil</surname><given-names>I</given-names> </name><name name-style="western"><surname>Goren</surname><given-names>S</given-names> </name></person-group><article-title>Hierarchical selective classification</article-title><access-date>2026-03-11</access-date><conf-name>Advances in Neural Information Processing Systems 37</conf-name><conf-date>Dec 10-15, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="http://www.proceedings.com/79017.html">http://www.proceedings.com/79017.html</ext-link></comment><pub-id pub-id-type="doi">10.52202/079017-3526</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roustan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bastardot</surname><given-names>F</given-names> </name></person-group><article-title>The clinicians&#x2019; guide to large language models: a general perspective with a focus on hallucinations</article-title><source>Interact J Med Res</source><year>2025</year><month>01</month><day>28</day><volume>14</volume><fpage>e59823</fpage><pub-id pub-id-type="doi">10.2196/59823</pub-id><pub-id pub-id-type="medline">39874574</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name></person-group><article-title>Alleviating the fear of losing alignment in LLM fine-tuning</article-title><conf-name>2025 IEEE Symposium on Security and Privacy (SP)</conf-name><conf-date>May 12-14, 2025</conf-date><conf-loc>San Francisco, CA</conf-loc><fpage>2152</fpage><lpage>2170</lpage><pub-id pub-id-type="doi">10.1109/SP61157.2025.00171</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Atapour-Abarghouei</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bonner</surname><given-names>S</given-names> </name><name name-style="western"><surname>McGough</surname><given-names>AS</given-names> </name></person-group><article-title>Rank over class: the untapped potential of ranking in natural language processing</article-title><conf-name>2021 IEEE International Conference on Big Data (Big Data)</conf-name><conf-date>Dec 15-18, 2021</conf-date><conf-loc>Orlando, FL</conf-loc><fpage>3950</fpage><lpage>3959</lpage><pub-id pub-id-type="doi">10.1109/BigData52589.2021.9671386</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="web"><article-title>EmoryCareLab/hierarchical-srh-intent</article-title><source>GitHub</source><year>2026</year><access-date>2026-02-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/EmoryCareLab/hierarchical-srh-intent">https://github.com/EmoryCareLab/hierarchical-srh-intent</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Annotation guidelines for hierarchical topic-subtopic labeling of the sexual and reproductive health queries (SRHQ-India dataset).</p><media 
xlink:href="jmir_v28i1e86545_app1.docx" xlink:title="DOCX File, 46 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Prompt template and statistical analysis of zero-shot hierarchical classification performance across open-weight, Indic, and proprietary large language models (LLMs), including a comparative analysis of GPT-5 and GPT-4o on representative user queries.</p><media xlink:href="jmir_v28i1e86545_app2.docx" xlink:title="DOCX File, 36 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Model-generated reasons for predicted hierarchical classifications across representative sexual and reproductive health (SRH) queries, reported for all evaluated models.</p><media xlink:href="jmir_v28i1e86545_app3.docx" xlink:title="DOCX File, 38 KB"/></supplementary-material></app-group></back></article>