diff --git a/evaluation/datasets/webvoyager_outdated_tasks.jsonl b/evaluation/datasets/webvoyager_outdated_tasks.jsonl new file mode 100644 index 00000000..aff42bdc --- /dev/null +++ b/evaluation/datasets/webvoyager_outdated_tasks.jsonl @@ -0,0 +1,8 @@ +{"web_name": "Apple", "id": "Apple--7", "ques": "When and where the Apple Vision Pro will be released.", "web": "https://www.apple.com/"} +{"web_name": "ArXiv", "id": "ArXiv--24", "ques": "Identify the most recent paper related to 'graph neural networks' on ArXiv and determine the affiliation of the first author.", "web": "https://arxiv.org/"} +{"web_name": "BBC News", "id": "BBC News--28", "ques": "Find the Market Data section on BBC News and tell me which company the data comes from.", "web": "https://www.bbc.com/news/"} +{"web_name": "GitHub", "id": "GitHub--20", "ques": "Open GitHub Copilot's FAQs to find the official answer to when Copilot chat can be used on mobile.", "web": "https://github.com/"} +{"web_name": "Huggingface", "id": "Huggingface--10", "ques": "Open space: argilla/notux-chat-ui and interact with it by asking it 'which team trained you'. What is its answer.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--20", "ques": "Locate a pre-trained natural language processing model on Hugging Face that specializes in named entity recognition (NER), confirm that the model was last updated in 2022 and has 1M+ downloads.", "web": "https://huggingface.co/"} +{"web_name": "Huggingface", "id": "Huggingface--34", "ques": "List the benefits of hugging face classroom mentioned on Hugging face website.", "web": "https://huggingface.co/"} +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--45", "ques": "A 175cm tall, 85kg, 40yo man climbs 2500 steps at about 18cm per step and 40 steps per minute. 
summarise the Metabolic properties.", "web": "https://www.wolframalpha.com/"} \ No newline at end of file diff --git a/evaluation/datasets/webvoyager_reference_answer.json b/evaluation/datasets/webvoyager_reference_answer.json index 03c2ac6a..c01a93fa 100644 --- a/evaluation/datasets/webvoyager_reference_answer.json +++ b/evaluation/datasets/webvoyager_reference_answer.json @@ -35,7 +35,7 @@ { "id": 6, "type": "possible", - "ans": "'Spinach Lasagna', 4.7-star, 501 reviews" + "ans": "'', 4.0+ star, 500+ reviews" }, { "id": 7, @@ -90,12 +90,12 @@ { "id": 17, "type": "golden", - "ans": "Easy to make and very delicious" + "ans": "" }, { "id": 18, "type": "possible", - "ans": "'Eggplant Lasagna', 4.7-star, 305 reviews" + "ans": "'', 4.5+ star, 300+ reviews" }, { "id": 19, @@ -285,7 +285,7 @@ { "id": 10, "type": "possible", - "ans": "Sony Playstation PS4 1TB Black Console; 2-Year Protection for $30.99" + "ans": "Sony Playstation PS4; 2-Year Protection for $20 to $40" }, { "id": 11, @@ -295,7 +295,7 @@ { "id": 12, "type": "possible", - "ans": "Worth every penny" + "ans": "" }, { "id": 13, @@ -455,7 +455,7 @@ { "id": 2, "type": "possible", - "ans": "14 Pro: Available at authorized resellers, A16 Bionic chip, 6-core CPU, 5-core GPU, 16-core Neural Engine; 15 Pro: Starting at $999, A17 Pro chip, 6-core CPU, 6-core GPU, 16-core Neural Engine" + "ans": "16 Pro:, ; 16: , " }, { "id": 3, @@ -470,7 +470,7 @@ { "id": 5, "type": "possible", - "ans": "iPhone 15 ($799) or pro ($999) or pro Max ($1199); September 22, 2023" + "ans": ", " }, { "id": 6, @@ -500,7 +500,7 @@ { "id": 11, "type": "possible", - "ans": "sixth-generation iPad Pro 11\u2011inch, iPad Pro 12.9\u2011inch; release date: October 26, 2022; base storage capacity 128 GB, starting price $799" + "ans": ", , " }, { "id": 12, @@ -525,7 +525,7 @@ { "id": 16, "type": "possible", - "ans": "2 types, price difference $10" + "ans": "2 types, price difference $50" }, { "id": 17, @@ -540,7 +540,7 @@ { "id": 19, "type": "golden", - 
"ans": "If you can dream it, Mac can do it; Mind-blowing. Head-turning" + "ans": "; " }, { "id": 20, @@ -550,17 +550,17 @@ { "id": 21, "type": "possible", - "ans": "128GB, 256GB, 512GB, 1TB, and 2TB" + "ans": "256GB, 512GB, 1TB, and 2TB" }, { "id": 22, "type": "possible", - "ans": "iPhone 13 Pro Max, Up to $500" + "ans": "iPhone 13 Pro Max, $3000 to $500" }, { "id": 23, "type": "possible", - "ans": "Apple Watch SE From $249, Apple Watch Series 9 From $399" + "ans": "Apple Watch SE From $249, Apple Watch Series 10 From $399" }, { "id": 24, @@ -575,12 +575,12 @@ { "id": 26, "type": "possible", - "ans": "4K video recording at 24 fps, 25 fps, 30 fps, or 60 fps" + "ans": "4K video recording at 60 fps" }, { "id": 27, "type": "possible", - "ans": "Available in multiple colors: Space Gray, Blue, Yellow, White, and Orange." + "ans": "Available in multiple colors: Midnight, Blue, Yellow, White, and Orange." }, { "id": 28, @@ -595,7 +595,7 @@ { "id": 30, "type": "possible", - "ans": "11-inch, 128GB from $799, 256GB from $899, 512GB from $1099, 1TB from $1499, and 2TB from $1899." + "ans": "11-inch, 256GB from , 512GB from , 1TB from , and 2TB from " }, { "id": 31, @@ -725,7 +725,7 @@ { "id": 12, "type": "golden", - "ans": "3" + "ans": "3 to 5" }, { "id": 13, @@ -735,7 +735,7 @@ { "id": 14, "type": "golden", - "ans": "3" + "ans": "4" }, { "id": 15, @@ -810,7 +810,7 @@ { "id": 29, "type": "possible", - "ans": "240+ (search by title)" + "ans": "212 (search by title)" }, { "id": 30, @@ -820,7 +820,7 @@ { "id": 31, "type": "golden", - "ans": "7 papers" + "ans": "2 papers" }, { "id": 32, @@ -895,7 +895,7 @@ { "id": 2, "type": "possible", - "ans": "
(within the last 2 days)" + "ans": "
(within the last 2 months)" }, { "id": 3, @@ -945,7 +945,7 @@ { "id": 12, "type": "possible", - "ans": "ramen, Tokyo" + "ans": ", " }, { "id": 13, @@ -1630,7 +1630,7 @@ { "id": 17, "type": "possible", - "ans": "23" + "ans": "30 to 50" }, { "id": 18, @@ -1690,7 +1690,7 @@ { "id": 29, "type": "possible", - "ans": "$399/year, discount: 59 / month * 12 - 399 = 309; Google, IBM, and Imperial College London ..." + "ans": "; Google, Microsoft, and IBM..." }, { "id": 30, @@ -1700,7 +1700,7 @@ { "id": 31, "type": "possible", - "ans": "52.6%" + "ans": "56.81%" }, { "id": 32, @@ -1805,7 +1805,7 @@ { "id": 9, "type": "golden", - "ans": "10 teams have Los Angeles in their name; 2 teams are NBA" + "ans": "9 teams have Los Angeles in their name; 1 team is NBA" }, { "id": 10, @@ -1845,7 +1845,7 @@ { "id": 17, "type": "golden", - "ans": "Boston Celtics; San Antonio Spurs" + "ans": "; " }, { "id": 18, @@ -1855,7 +1855,7 @@ { "id": 19, "type": "golden", - "ans": "Jrue Holiday" + "ans": "Jaylen Brown" }, { "id": 20, @@ -1895,7 +1895,7 @@ { "id": 27, "type": "golden", - "ans": "30 teams in search results, 1 team Vegas Golden Knights (NHL)" + "ans": "31 teams in search results, 1 team Vegas Golden Knights (NHL)" }, { "id": 28, @@ -1915,7 +1915,7 @@ { "id": 31, "type": "golden", - "ans": "Carlos Rodon, 255 lbs" + "ans": "Paul Goldschmidt, 225 lbs" }, { "id": 32, @@ -1945,7 +1945,7 @@ { "id": 37, "type": "possible", - "ans": "1471" + "ans": "1471 or more" }, { "id": 38, @@ -1975,7 +1975,7 @@ { "id": 43, "type": "possible", - "ans": "espnW Rankings Class of 2023, Judea Watkins from USC, Mikaylah Williams from LSU, Jadyn Donovan from Duke" + "ans": "espnW Rankings Class of 2025, Aaliyah Chavez, Sienna Betts from UCLA, Jasmine Davidson from USC" } ] }, @@ -2055,7 +2055,7 @@ { "id": 14, "type": "possible", - "ans": "bpasero; jrieken; mjbvz" + "ans": "bpasero; jrieken; Tyriar" }, { "id": 15, @@ -2150,7 +2150,7 @@ { "id": 33, "type": "possible", - "ans": "Philips builds and deploys digital 
health technology faster with innersource on GitHub. Shopify keeps pushing eCommerce forward with help from GitHub tools." + "ans": "Duolingo boosts developer speed by 25% with GitHub Copilot. " }, { "id": 34, @@ -2175,7 +2175,7 @@ { "id": 38, "type": "golden", - "ans": "WerWolv/ImHex" + "ans": "sherlock-project/sherlock" }, { "id": 39, @@ -2685,7 +2685,7 @@ { "id": 13, "type": "possible", - "ans": "Strange Planet, 2023" + "ans": "iCarly Reboot" }, { "id": 14, @@ -2750,7 +2750,7 @@ { "id": 26, "type": "possible", - "ans": "RAM 8 GB; Processor: Multicore Intel\u00ae or Apple Silicon processor (2 GHz or faster processor with SSE 4.2 or later) with 64-bit support; Operating system, macOS Big Sur (version 11.0) or later; Graphics card, GPU with Metal support, 1.5 GB of GPU memory ..." + "ans": " (Mac) ..." }, { "id": 27, @@ -2780,7 +2780,7 @@ { "id": 32, "type": "possible", - "ans": "Manchester City Football Club; June 10, 2023; Atat\u00fcrk Olympic Stadium, Istanbul, Turkey" + "ans": "The final was held in London on June 1, 2024, and the winner was Real Madrid." }, { "id": 33, @@ -2810,7 +2810,7 @@ { "id": 38, "type": "possible", - "ans": "next: April 8, 2024. The one after that will take place on August 23, 2044." + "ans": "next: August 23, 2044. The one after that will take place on August 12, 2045." 
}, { "id": 39, @@ -2970,7 +2970,7 @@ { "id": 26, "type": "possible", - "ans": "PhilipTheGreat/DiabloGPT-small-Traveller, GPT2LMHeadModel, 510 MB" + "ans": " (generating travel chats), , , " }, { "id": 27, diff --git a/evaluation/datasets/webvoyager_tasks.jsonl b/evaluation/datasets/webvoyager_tasks.jsonl index 7ba2ebd2..45c93017 100644 --- a/evaluation/datasets/webvoyager_tasks.jsonl +++ b/evaluation/datasets/webvoyager_tasks.jsonl @@ -14,10 +14,10 @@ {"web_name": "Allrecipes", "id": "Allrecipes--13", "ques": "Find a recipe with over 100 reviews for Fried Fish on Allrecipes, list the Full Nutrition Label and tell me the amount of Iron per Serving.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--14", "ques": "Search for a recipe that includes \"chicken breast\" and \"quinoa\" with preparation time under 30 minutes on Allrecipes.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--15", "ques": "Choose a dessert recipe on Allrecipes with a prep time of less than 30 minutes, has chocolate as an ingredient, and has a user rating of 4 stars or higher. Provide the name of the recipe, ingredients list, and step-by-step instructions.", "web": "https://www.allrecipes.com/"} -{"web_name": "Allrecipes", "id": "Allrecipes--16", "ques": "Find a five-star rated chocolate chip cookie recipe that takes less than 1 hour to make on Allrecipes. Note how many reviews the recipe has and the main ingredients required.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--16", "ques": "Find a chocolate chip cookie recipe that has at least a 4.5 stars rating and takes less than 1 hour to make on Allrecipes. 
Note how many reviews the recipe has and the main ingredients required.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--17", "ques": "Find the Easy Vegetarian Spinach Lasagna recipe on Allrecipes and tell me what the latest review says.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--18", "ques": "Find a recipe for a vegetarian lasagna that has over 300 reviews and an average rating of 4.5 or higher on Allrecipes.", "web": "https://www.allrecipes.com/"} -{"web_name": "Allrecipes", "id": "Allrecipes--19", "ques": "Find a vegan lasagna recipe on Allrecipes that requires 10 ingredients or less and has feedback of more than 200 reviews. Provide a brief overview of the ingredient list and the total prep and cook time.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--19", "ques": "Find a vegan lasagna recipe on Allrecipes that requires 10 ingredients or less and has feedback of more than 15 reviews. Provide a brief overview of the ingredient list and the total prep and cook time.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--20", "ques": "Find a recipe for a cauliflower pizza crust that has a preparation time of under 30 minutes and a rating of at least 4 stars on Allrecipes. Include the number of calories per serving.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--21", "ques": "Locate a high-rated recipe for gluten-free brownies on Allrecipes with at least 50 reviews. List the main ingredients and the total time required for preparation and cooking.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--22", "ques": "Find a recipe for a healthy avocado salad on Allrecipes that has a preparation time of less than 20 minutes and more than 30 user reviews. 
Include the nutritional information per serving.", "web": "https://www.allrecipes.com/"} @@ -29,7 +29,7 @@ {"web_name": "Allrecipes", "id": "Allrecipes--28", "ques": "On Allrecipes, find a vegan brownie recipe that has at least 40 reviews and a rating of 4.5 or higher. Include the list of ingredients, total prep and cook time, and a brief overview of the preparation steps.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--29", "ques": "Search for a Mediterranean-style grilled fish recipe on Allrecipes that includes ingredients like olives, has at least a 4-star rating, and more than 25 reviews. Detail the ingredients, cooking method, and total time required for preparation and cooking.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--30", "ques": "Find a recipe for a vegan smoothie bowl on Allrecipes that includes bananas and leaves, has more than 20 reviews, and a rating of at least 4 stars. Provide a list of ingredients, preparation time, and a summary of the recipe steps.", "web": "https://www.allrecipes.com/"} -{"web_name": "Allrecipes", "id": "Allrecipes--31", "ques": "Search for a seafood paella recipe on Allrecipes with a minimum of 4.5 stars rating and at least 50 reviews. The recipe should include shrimp and mussels. Provide the ingredients, total time, and an overview of the preparation steps.", "web": "https://www.allrecipes.com/"} +{"web_name": "Allrecipes", "id": "Allrecipes--31", "ques": "Search for a seafood paella recipe on Allrecipes with a minimum of 4.5 stars rating and at least 50 reviews. The recipe should include shrimp. Provide the ingredients, total time, and an overview of the preparation steps.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--32", "ques": "Find a high-rated beef stew recipe on Allrecipes that requires a slow cooker and has at least 30 reviews. 
Detail the cooking time and the first five ingredients listed in the recipe.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--33", "ques": "Find a recipe for a low-carb breakfast on Allrecipes with at least 25 reviews. Show the Nutrition Facts and the total carbohydrate content per serving.", "web": "https://www.allrecipes.com/"} {"web_name": "Allrecipes", "id": "Allrecipes--34", "ques": "Locate a baked salmon recipe on Allrecipes that has at least 50 reviews and a rating of 4.5 stars or higher. Note the primary seasoning or herb used and the estimated cooking time.", "web": "https://www.allrecipes.com/"} @@ -86,21 +86,20 @@ {"web_name": "Amazon", "id": "Amazon--40", "ques": "Locate a women's yoga mat in purple, with a thickness of at least 5mm, rated 4+ stars, and priced under $30 on Amazon. Check how many colors are available in total, and what is the return and delivery policy.", "web": "https://www.amazon.com/"} {"web_name": "Apple", "id": "Apple--0", "ques": "Compare the prices of the latest models of MacBook Air available on Apple's website.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--1", "ques": "Research the new features of the iOS 17 on Apple support and check its compatibility with the iPhone 12.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--2", "ques": "Compare the prices and chips for the iPhone 14 Pro and iPhone 15 Pro models directly from Apple's website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--2", "ques": "Compare the prices and chips for the iPhone 16 Pro and iPhone 16 models directly from Apple's website.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--3", "ques": "Find the latest model of the iPhone and compare the price and screen size between the pro and pro max.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--4", "ques": "How much does it cost to buy a Macbook pro, 16-inch, Apple 
M3 Max chip with 16-core CPU, 40-core GPU, 64GB unified memory, 1TB SSD.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--5", "ques": "Check the release date and price for the latest version of the iPhone.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--5", "ques": "Check price for the latest version of the iPhone.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--6", "ques": "Find AirPods on Apple and how many types are currently available.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--7", "ques": "When and where the Apple Vision Pro will be released.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--8", "ques": "Identify and list the specifications of the latest iPad model released by Apple, including its storage options, processor type, and display features.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--9", "ques": "Check the Apple Store for the availability of the latest iPhone model and schedule an in-store pickup at the nearest Apple Store for January 10, 2024.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--10", "ques": "Find information on the latest (as of today's date) MacBook model, including its key features such as processor type, memory size, and storage capacity.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--11", "ques": "Get information about the latest iPad model released by Apple, including its release date, base storage capacity, and starting price available on Apple's official website.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--12", "ques": "What Apple Repair ways are mentioned on apple website, answer 2 of them.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--11", "ques": "Get information about the latest iPad model released by Apple, including its base storage capacity, and starting price available on 
Apple's official website.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--12", "ques": "Find Apple Repair page and find ways to repair that are mentioned on apple website, answer 2 of them.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--13", "ques": "How many colors does the latest MacBook Air come in?", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--14", "ques": "Identify the upgrade options available for the cheapest base model of the MacBook Pro 14-inch with M3 chip, and calculate the total price difference from the base model to the maximum upgrade (no Pre-Installed Software) offered by Apple.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--15", "ques": "On Apple's website, how many different types of keyboards are available when customizing your 14-inch MacBook Pro?", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--16", "ques": "Find on Apple website how many types of AirPods (3rd generation) are available and what is the price difference.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--15", "ques": "On Apple's website, how many different language types of keyboards are available when customizing your 14-inch MacBook Pro?", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--16", "ques": "Find on Apple website how many types of AirPods (4th generation) are available and what is the price difference.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--17", "ques": "Search Apple for the accessory Smart Folio for iPad and check the closest pickup availability next to zip code 90038.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--18", "ques": "Check if there are trade-in offers for the latest model of iPhone.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--19", "ques": "On Apple's website, what is the slogan for the Mac and what is the slogan 
for the Macbook pro.", "web": "https://www.apple.com/"} @@ -118,7 +117,7 @@ {"web_name": "Apple", "id": "Apple--31", "ques": "On Apple's website, what is the slogan for the latest Apple Watch Series.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--32", "ques": "Investigate the trade-in value for an iPhone 11 Pro Max on Apple's website.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--33", "ques": "Look for the color options available for the newest iMac.", "web": "https://www.apple.com/"} -{"web_name": "Apple", "id": "Apple--34", "ques": "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced.", "web": "https://www.apple.com/"} +{"web_name": "Apple", "id": "Apple--34", "ques": "Identify the size and weight for the Apple TV 4K and list the Siri Remote features introduced from the Apple TV 4K tech specs page.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--35", "ques": "How many types of Apple Pencil are currently available on the Apple's website? 
Which one supports Wireless pairing and charging.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--36", "ques": "Browse Apple Music on the entertainment section of the Apple's website, and see which singers' names are included in the pictures on this page.", "web": "https://www.apple.com/"} {"web_name": "Apple", "id": "Apple--37", "ques": "Compare the color options of iPhone 13 Pro, iPhone 14 Pro and iPhone 15 Pro.", "web": "https://www.apple.com/"} @@ -151,7 +150,6 @@ {"web_name": "ArXiv", "id": "ArXiv--21", "ques": "Search for papers on 'neural networks for image processing' in the Computer Science category on ArXiv and report how many were submitted in the last week.", "web": "https://arxiv.org/"} {"web_name": "ArXiv", "id": "ArXiv--22", "ques": "Locate the ArXiv Help section and find instructions on how to subscribe to daily listing emails for new submissions in a specific category.", "web": "https://arxiv.org/"} {"web_name": "ArXiv", "id": "ArXiv--23", "ques": "Determine how many articles with the keyword 'autonomous vehicles' were published in the 'Electrical Engineering and Systems Science' section of ArXiv yesterday.", "web": "https://arxiv.org/"} -{"web_name": "ArXiv", "id": "ArXiv--24", "ques": "Identify the most recent paper related to 'graph neural networks' on ArXiv and determine the affiliation of the first author.", "web": "https://arxiv.org/"} {"web_name": "ArXiv", "id": "ArXiv--25", "ques": "Browse the ArXiv store and let me know how many different types of merchandise are available.", "web": "https://arxiv.org/"} {"web_name": "ArXiv", "id": "ArXiv--26", "ques": "Search for papers related to 'climate change modeling' on ArXiv and find out how many have been published in the Earth and Planetary Astrophysics (astro-ph.EP) category in the last week.", "web": "https://arxiv.org/"} {"web_name": "ArXiv", "id": "ArXiv--27", "ques": "On ArXiv, what categories does Economics include, and what are their abbreviations?", "web": 
"https://arxiv.org/"} @@ -172,7 +170,7 @@ {"web_name": "ArXiv", "id": "ArXiv--42", "ques": "Find an article published between 1 January 2000 and 1 January 2005 that requires Support Vector Machines in the title and its Journey ref is ACL Workshop.", "web": "https://arxiv.org/"} {"web_name": "BBC News", "id": "BBC News--0", "ques": "Find a report on the BBC News website about recent developments in renewable energy technologies in the UK.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--1", "ques": "Read the latest health-related news article published on BBC News and summarize the key points discussed.", "web": "https://www.bbc.com/news/"} -{"web_name": "BBC News", "id": "BBC News--2", "ques": "Read the latest article regarding the environmental impacts of deforestation published within the last two days.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--2", "ques": "Read the latest article regarding the environmental impacts of deforestation published within the last two months.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--3", "ques": "Check the leaderboard for Golf's DP World Tour in the SPORT section, what was the name of the most recent tournament, and how many teams have a Total of -10 strokes.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--4", "ques": "Find the latest article regarding the economic implications of climate change in Europe as reported by BBC News and summarize the central points.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--5", "ques": "Find the article \"What is climate change? 
A really simple guide\" and use it to answer what human activities are causing climate change.", "web": "https://www.bbc.com/news/"} @@ -185,7 +183,7 @@ {"web_name": "BBC News", "id": "BBC News--12", "ques": "Find a picture in the travel section that contains food, tell me what the food is called and what region it comes from.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--13", "ques": "Search for recent news related to Trump and summarize the main points.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--14", "ques": "Find a news article on BBC News about the impact of the recent tech industry layoffs on the global economy. Summarize the key points and the name of the author, and provide the date of publication.", "web": "https://www.bbc.com/news/"} -{"web_name": "BBC News", "id": "BBC News--15", "ques": "What does the current headline in Natural Wonders tell about.", "web": "https://www.bbc.com/news/"} +{"web_name": "BBC News", "id": "BBC News--15", "ques": "What does the current headline in Earth -> Natural Wonders tell about.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--16", "ques": "Identify the most recent development or update in Brexit negotiations as reported on BBC News and report the key points and any stated impacts on European economies.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--17", "ques": "How many War related sections are currently in BBC News.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--18", "ques": "Visit BBC News Audio, What are the best PodCasts for 2023? 
List 2 of them.", "web": "https://www.bbc.com/news/"} @@ -198,7 +196,6 @@ {"web_name": "BBC News", "id": "BBC News--25", "ques": "Find the most recent sports analysis article on BBC News related to the English Premier League and summarize its key insights.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--26", "ques": "Locate the latest report on BBC News about the impact of recent natural disasters in Asia and summarize the key points and areas affected.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--27", "ques": "Find the most recent article on BBC News about archaeological discoveries and summarize the main findings and their significance.", "web": "https://www.bbc.com/news/"} -{"web_name": "BBC News", "id": "BBC News--28", "ques": "Find the Market Data section on BBC News and tell me which company the data comes from.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--29", "ques": "Visit BBC News Audio and find out which podcast episode is currently featured as the \"New Releases\".", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--30", "ques": "In the Culture section, identify the latest film release reviewed and provide a brief summary of the review.", "web": "https://www.bbc.com/news/"} {"web_name": "BBC News", "id": "BBC News--31", "ques": "Check the Sports section for the result of the most recent Manchester United football match.", "web": "https://www.bbc.com/news/"} @@ -232,7 +229,7 @@ {"web_name": "Booking", "id": "Booking--17", "ques": "Find a hotel in Paris with a fitness center and a rating of 8 or higher available for a 5-night stay starting from February 14, 2024, and sort the results by best reviewed.", "web": "https://www.booking.com/"} {"web_name": "Booking", "id": "Booking--18", "ques": "Search a hotel in London with a user rating of 8 or higher for a stay between February 14th, 2024, and February 21st, 2024, suitable for a 
couple. Provide the name and a short description of the hotel.", "web": "https://www.booking.com/"} {"web_name": "Booking", "id": "Booking--19", "ques": "Look for a hotel with customer ratings above an 8.0 in Paris, France for a weekend stay from March 18, 2024, to March 20, 2024, and list top three suggestions based on user reviews.", "web": "https://www.booking.com/"} -{"web_name": "Booking", "id": "Booking--20", "ques": "Locate a hotel in Rome with a good rating (7 or above) that offers free cancellation and breakfast included, for a three-night stay from February 28 to March 2, 2024, for two adults.", "web": "https://www.booking.com/"} +{"web_name": "Booking", "id": "Booking--20", "ques": "Locate a hotel in Rome with a good rating (7 or above) that offers free cancellation and breakfast included, for a three-night stay from February 15 to February 18, 2025, for two adults.", "web": "https://www.booking.com/"} {"web_name": "Booking", "id": "Booking--21", "ques": "Find a hotel in Sydney with a rating of 8 or higher, providing free Wi-Fi and parking, available for a four-night stay starting on March 10, 2024.", "web": "https://www.booking.com/"} {"web_name": "Booking", "id": "Booking--22", "ques": "Search for a hotel in Amsterdam with a customer review score of 9 or higher, offering bicycle rentals, for a week-long stay from March 15 to March 22, 2024, for two adults.", "web": "https://www.booking.com/"} {"web_name": "Booking", "id": "Booking--23", "ques": "Identify a hotel in Tokyo with a spa and wellness center, rated 9 or above, with availability for a five-night stay starting on February 20, 2024. 
Check if free cancellation is offered.", "web": "https://www.booking.com/"} @@ -316,7 +313,7 @@ {"web_name": "Coursera", "id": "Coursera--14", "ques": "Find a course on Coursera related to Introductory Project Management that includes modules on Agile methodology.", "web": "https://www.coursera.org/"} {"web_name": "Coursera", "id": "Coursera--15", "ques": "Find a course on Coursera named 'Introduction to Mathematical Thinking' offered by Stanford, what is the percentage (rounded) of 5 star ratings in reviews and which level has the least percentage?.", "web": "https://www.coursera.org/"} {"web_name": "Coursera", "id": "Coursera--16", "ques": "Identify a course on Coursera named 'Introduction to Finance: The Basics', who is the course instructor and what other courses does he/she teach.", "web": "https://www.coursera.org/"} -{"web_name": "Coursera", "id": "Coursera--17", "ques": "How many results are there for a search on Coursera for Machine Learning, then filtered by Credit Eligible and 1-4 Years duration?", "web": "https://www.coursera.org/"} +{"web_name": "Coursera", "id": "Coursera--17", "ques": "How many results are there for a search on Coursera for Machine Learning, then filtered by Degrees and 1-4 Years duration?", "web": "https://www.coursera.org/"} {"web_name": "Coursera", "id": "Coursera--18", "ques": "Identify a Coursera course that teaches JavaScript, which is beginner-friendly and includes a certificate upon completion.", "web": "https://www.coursera.org/"} {"web_name": "Coursera", "id": "Coursera--19", "ques": "Identify a course on Coursera that provides an introduction to Psychology, list the instructor's name, the institution offering it, and how many hours it will approximately take to complete.", "web": "https://www.coursera.org/"} {"web_name": "Coursera", "id": "Coursera--20", "ques": "Find an Intermediate-level online course on Coursera about 'Blockchain Technology' which lasts between 1 to 4 weeks, and is provided by a well-known institution. 
Also, note the course's main goals and the instructor's name.", "web": "https://www.coursera.org/"} @@ -356,11 +353,11 @@ {"web_name": "ESPN", "id": "ESPN--12", "ques": "The first three Top Headlines in the current ESPN home page correspond to which sports leagues?", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--13", "ques": "Identify today's top headline in the Basketball section of ESPN, and summarize the main points of that article.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--14", "ques": "Find the latest news about NBA trades or player movements on ESPN and report the most recent trade deal OR player acquisition.", "web": "https://www.espn.com/"} -{"web_name": "ESPN", "id": "ESPN--15", "ques": "Check the scores of the NBA games played on December 25, 2023.", "web": "https://www.espn.com/"} -{"web_name": "ESPN", "id": "ESPN--16", "ques": "Check the schedule for the NBA game on December 25, 2023, and provide the teams that are playing and their current standings in their respective conferences.", "web": "https://www.espn.com/"} -{"web_name": "ESPN", "id": "ESPN--17", "ques": "Check out the NBA Basketball Power Index 2023-24 to see which teams are in first place and which are in last place.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--15", "ques": "Check the scores of the NBA games played on December 25, 2024.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--16", "ques": "Check the schedule for the NBA game on December 25, 2024, and provide the teams that are playing and their current standings in their respective conferences.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--17", "ques": "Check out the NBA Basketball Power Index 2024-25 to see which teams are in first place and which are in last place.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--18", "ques": "How many sports leagues can you choose from on the ESPN home page?", "web": 
"https://www.espn.com/"} -{"web_name": "ESPN", "id": "ESPN--19", "ques": "Who has the highest salary in Boston Celtics Roster 2023-24?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--19", "ques": "Who has the highest salary in Boston Celtics Roster 2024-25?", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--20", "ques": "Look up the current leaders in rebounds and assists in the NBA Western Conference on ESPN.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--21", "ques": "Show the scores and main highlight of the Denver Nuggets game that occurred within the last 3 days on ESPN.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--22", "ques": "Find the latest Team transactions in the NBA within the past week.", "web": "https://www.espn.com/"} @@ -371,8 +368,8 @@ {"web_name": "ESPN", "id": "ESPN--27", "ques": "Search on ESPN for how many teams have 'Golden' in their name and how many of them are in the NHL.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--28", "ques": "How many MLB teams are there and list all the teams with 'City' in their name.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--29", "ques": "Identify today's top headline in the Soccer section of ESPN, and summarize the main points of that article.", "web": "https://www.espn.com/"} -{"web_name": "ESPN", "id": "ESPN--30", "ques": "Check out the NHL Standings 2023-24 on ESPN to see which teams are at the top and which are at the bottom in Eastern and Western Conference. What about the situation in Division.", "web": "https://www.espn.com/"} -{"web_name": "ESPN", "id": "ESPN--31", "ques": "Who has the heaviest weight among infielders in the New York Yankees Roster 2023-24?", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--30", "ques": "Check out the NHL Standings 2024-25 on ESPN to see which teams are at the top and which are at the bottom in Eastern and Western Conference. 
What about the situation in Division.", "web": "https://www.espn.com/"} +{"web_name": "ESPN", "id": "ESPN--31", "ques": "Who has the heaviest weight among infielders in the New York Yankees Roster 2024-25?", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--32", "ques": "Review yesterday's NHL game results on ESPN, focusing on teams' performance.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--33", "ques": "Locate the latest ESPN articles discussing potential MVP candidates in the NFL for 2023 season.", "web": "https://www.espn.com/"} {"web_name": "ESPN", "id": "ESPN--34", "ques": "Visit ESPN to view the Philadelphia 76ers' latest injuries.", "web": "https://www.espn.com/"} @@ -405,7 +402,6 @@ {"web_name": "GitHub", "id": "GitHub--17", "ques": "Locate a C++ project on GitHub that has been recently updated in the last week and has at least 500 stars, then describe its main purpose.", "web": "https://github.com/"} {"web_name": "GitHub", "id": "GitHub--18", "ques": "Identify and report the most popular (in terms of stars) open-source image processing tool on GitHub.", "web": "https://github.com/"} {"web_name": "GitHub", "id": "GitHub--19", "ques": "Look up the most recently updated Python repository on GitHub that is tagged with 'web scraping' and has over 100 stars.", "web": "https://github.com/"} -{"web_name": "GitHub", "id": "GitHub--20", "ques": "Open GitHub Copilot's FAQs to find the official answer to when Copilot chat can be used on mobile.", "web": "https://github.com/"} {"web_name": "GitHub", "id": "GitHub--21", "ques": "Find the Security topic in GitHub Resources and answer the role of GitHub Advanced Security.", "web": "https://github.com/"} {"web_name": "GitHub", "id": "GitHub--22", "ques": "Find an open-source repository on GitHub focused on natural language processing in Ruby, updated within the last week.", "web": "https://github.com/"} {"web_name": "GitHub", "id": "GitHub--23", "ques": "Find the wiki page of ohmyzsh 
on GitHub and tell me how to change the theme of zsh to agnoster.", "web": "https://github.com/"} @@ -562,7 +558,6 @@ {"web_name": "Huggingface", "id": "Huggingface--7", "ques": "Which is the most downloaded audio related dataset on Hugging face currently.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--8", "ques": "Retrieve an example of a pre-trained language model in natural language processing and identify the tasks it is specifically designed for, like translation or text summarization.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--9", "ques": "Find the most download machine translation model on Huggingface which focuses on English and Japanese (en-ja) and report the evaluation metrics stated for it.", "web": "https://huggingface.co/"} -{"web_name": "Huggingface", "id": "Huggingface--10", "ques": "Open space: argilla/notux-chat-ui and interact with it by asking it 'which team trained you'. What is its answer.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--11", "ques": "Identify the latest updated image to video model available on Huggingface and summarize its main features.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--12", "ques": "Find the most recently updated machine learning model on Huggingface which focuses on Error Correction.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--13", "ques": "Search for LLaMA in the huggingface doc, what type is the spaces_between_special_tokens parameter in LlamaTokenizer and what is its default value.", "web": "https://huggingface.co/"} @@ -572,7 +567,6 @@ {"web_name": "Huggingface", "id": "Huggingface--17", "ques": "Find the most recently updated open-source project related to natural language processing on the Huggingface platform. 
Provide the project's name, creator, and a brief description of its functionality.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--18", "ques": "Look up TRL's forward modelling in the hugging face documentation on how to add a margin to a loss.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--19", "ques": "Explore and summarize the features of the most recent open-source NLP model released by Hugging Face for English text summarization.", "web": "https://huggingface.co/"} -{"web_name": "Huggingface", "id": "Huggingface--20", "ques": "Locate a pre-trained natural language processing model on Hugging Face that specializes in named entity recognition (NER), confirm that the model was last updated in 2022 and has 1M+ downloads.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--21", "ques": "Look up the tour about how to use the 'pipeline' feature in the Hugging Face Transformers library for sentiment analysis, and identify the default model it uses.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--22", "ques": "Identify the steps to convert a PyTorch model to TensorFlow using the Hugging Face Transformers library as described in their documentation.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--23", "ques": "Identify three innovative and widely recognized open-source NLP models for automatic speech recognition released in the past month on Huggingface.", "web": "https://huggingface.co/"} @@ -586,7 +580,6 @@ {"web_name": "Huggingface", "id": "Huggingface--31", "ques": "Identify the latest machine learning model on Huggingface that specializes in detecting fake news, including the date of its last update.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--32", "ques": "On the Hugging Face website, search for the model 'GPT-J-6B' and find the 'temperature' 
parameter in its settings. What is the default value of this parameter?", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--33", "ques": "List three hugging face docs. How many GitHub stars have they earned so far?", "web": "https://huggingface.co/"} -{"web_name": "Huggingface", "id": "Huggingface--34", "ques": "List the benefits of hugging face classroom mentioned on Hugging face website.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--35", "ques": "Find the latest Diffusion-related blog on Hugging Face, and read its intro or overview section to roughly summarize the content of the blog.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--36", "ques": "Summarize all the payment plans and their advantages in huggingface pricing.", "web": "https://huggingface.co/"} {"web_name": "Huggingface", "id": "Huggingface--37", "ques": "Browse the daily paper on Hugging Face. What is the title of the first article, how many upvotes has it received, and is there any related model or data release?", "web": "https://huggingface.co/"} @@ -639,5 +632,4 @@ {"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--41", "ques": "What is the approximate Heart Rate Reserve of a 50 year old man who has a heart rate of 60bpm at rest.", "web": "https://www.wolframalpha.com/"} {"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--42", "ques": "What is the raw memory of a 100.2\" * 123.5\" true colour picture at 72 ppi?", "web": "https://www.wolframalpha.com/"} {"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--43", "ques": "A polyominoes of order 6 means you have 6 identical squares to combine different shapes (2-sided). How many combinations are there? 
Looking at all the shapes in the result, how many of them have only 2 rows in total?", "web": "https://www.wolframalpha.com/"} -{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--44", "ques": "Solve the ODE, g' + cos(g) = 0, if there is a constant in the result, determine the value of the constant by the condition that g(0) = 1.", "web": "https://www.wolframalpha.com/"} -{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--45", "ques": "A 175cm tall, 85kg, 40yo man climbs 2500 steps at about 18cm per step and 40 steps per minute. summarise the Metabolic properties.", "web": "https://www.wolframalpha.com/"} \ No newline at end of file +{"web_name": "Wolfram Alpha", "id": "Wolfram Alpha--44", "ques": "Solve the ODE, g' + cos(g) = 0, if there is a constant in the result, determine the value of the constant by the condition that g(0) = 1.", "web": "https://www.wolframalpha.com/"} \ No newline at end of file