1242. Web Crawler Multithreaded

LeetCode medium original: C# #array #csharp #graph #hash-table #leetcode #medium #string

Der Aufgabentext wird für die gewählte Sprache aus dem Russischen übersetzt. Code bleibt unverändert.

LeetCode Static

given URL startUrl и интерфейс HtmlParser, реализуйте многопоточный веб-краулер, который будет просматривать все ссылки, находящиеся под тем же именем хоста, что и startUrl. return все URL, полученные вашим веб-краулером, в любом порядке.

Ваш краулер должен: Начинать со страницы: startUrl Вызывать HtmlParser.getUrls(url), чтобы получить все URL с веб-страницы данного URL. Не просматривать одну и ту же ссылку дважды. Исследовать только те ссылки, которые находятся под тем же именем хоста, что и startUrl.

Beispiel:

Input:

urls = [

"http://news.yahoo.com",

"http://news.yahoo.com/news",

"http://news.yahoo.com/news/topics/",

"http://news.google.com",

"http://news.yahoo.com/us"

]

edges = [[2,0],[2,1],[3,2],[3,1],[0,4]]

startUrl = "http://news.yahoo.com/news/topics/"

Output: [

"http://news.yahoo.com",

"http://news.yahoo.com/news",

"http://news.yahoo.com/news/topics/",

"http://news.yahoo.com/us"

]

C# Lösung

zugeordnet/original

using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Net;
using System.Threading.Tasks;
public class HtmlParser {
    public List<string> GetUrls(string url) {
        return new List<string>();
    }
}
public class Solution {
    private ConcurrentDictionary<string, bool> visited = new ConcurrentDictionary<string, bool>();
    private string hostname;
    private HtmlParser htmlParser;
    public IList<string> Crawl(string startUrl, HtmlParser htmlParser) {
        this.hostname = new Uri(startUrl).Host;
        this.htmlParser = htmlParser;
        var tasks = new List<Task>();
        visited.TryAdd(startUrl, true);
        tasks.Add(Task.Run(() => Visit(startUrl)));
        Task.WaitAll(tasks.ToArray());
        return visited.Keys.ToList();
    }
    private async Task Visit(string url) {
        foreach (var nextUrl in htmlParser.GetUrls(url)) {
            if (new Uri(nextUrl).Host == hostname && visited.TryAdd(nextUrl, true)) {
                await Visit(nextUrl);
            }
        }
    }
}

C++ Lösung

Auto-Entwurf, vor dem Einreichen prüfen

#include <bits/stdc++.h>
using namespace std;

// Auto-generated C++ draft from the C# solution. Review containers, LINQ and helper types before submit.
public class HtmlParser {
    public List<string> GetUrls(string url) {
        return new List<string>();
    }
}
class Solution {
public:
    private ConcurrentDictionary<string, bool> visited = new ConcurrentDictionary<string, bool>();
    private string hostname;
    private HtmlParser htmlParser;
    public vector<string> Crawl(string startUrl, HtmlParser htmlParser) {
        this.hostname = new Uri(startUrl).Host;
        this.htmlParser = htmlParser;
        var tasks = new List<Task>();
        visited.TryAdd(startUrl, true);
        tasks.push_back(Task.Run(() => Visit(startUrl)));
        Task.WaitAll(tasks.ToArray());
        return visited.Keys.ToList();
    }
    private async Task Visit(string url) {
        foreach (var nextUrl in htmlParser.GetUrls(url)) {
            if (new Uri(nextUrl).Host == hostname && visited.TryAdd(nextUrl, true)) {
                await Visit(nextUrl);
            }
        }
    }
}

Java Lösung

zugeordnet/original

import java.net.*;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicBoolean;

class HtmlParser {
    public List<String> getUrls(String url) {
        return new ArrayList<>();
    }
}

public class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostname = extractHostname(startUrl);
        Set<String> visited = ConcurrentHashMap.newKeySet();
        ExecutorService executor = Executors.newFixedThreadPool(10);
        Queue<Future<?>> futures = new ConcurrentLinkedQueue<>();

        visited.add(startUrl);
        futures.add(executor.submit(() -> visit(startUrl, htmlParser, hostname, visited, futures, executor)));

        while (!futures.isEmpty()) {
            try {
                futures.poll().get();
            } catch (InterruptedException | ExecutionException e) {
                e.printStackTrace();
            }
        }

        executor.shutdown();
        return new ArrayList<>(visited);
    }

    private void visit(String url, HtmlParser htmlParser, String hostname, Set<String> visited, Queue<Future<?>> futures, ExecutorService executor) {
        for (String nextUrl : htmlParser.getUrls(url)) {
            if (extractHostname(nextUrl).equals(hostname) && visited.add(nextUrl)) {
                futures.add(executor.submit(() -> visit(nextUrl, htmlParser, hostname, visited, futures, executor)));
            }
        }
    }

    private String extractHostname(String url) {
        try {
            URL u = new URL(url);
            return u.getHost();
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return "";
        }
    }
}

JavaScript Lösung

zugeordnet/original

class HtmlParser {
    getUrls(url) {
        return [];
    }
}

const extractHostname = (url) => {
    const { hostname } = new URL(url);
    return hostname;
};

const crawl = async (startUrl, htmlParser) => {
    const hostname = extractHostname(startUrl);
    const visited = new Set([startUrl]);
    const queue = [startUrl];

    while (queue.length) {
        const url = queue.shift();
        const urls = await htmlParser.getUrls(url);

        for (const nextUrl of urls) {
            if (extractHostname(nextUrl) === hostname && !visited.has(nextUrl)) {
                visited.add(nextUrl);
                queue.push(nextUrl);
            }
        }
    }

    return Array.from(visited);
};

Python Lösung

zugeordnet/original

from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse
import threading

class HtmlParser:
    def getUrls(self, url: str) -> [str]:
        pass

def extract_hostname(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc

def crawl(startUrl, htmlParser):
    hostname = extract_hostname(startUrl)
    visited = set()
    lock = threading.Lock()

    def visit(url):
        if extract_hostname(url) == hostname:
            with lock:
                if url not in visited:
                    visited.add(url)
                    for next_url in htmlParser.getUrls(url):
                        executor.submit(visit, next_url)

    with ThreadPoolExecutor(max_workers=10) as executor:
        executor.submit(visit, startUrl)

    return list(visited)

Go Lösung

zugeordnet/original

package main

import (
    "net/url"
    "sync"
)

type HtmlParser struct{}

func (p *HtmlParser) GetUrls(url string) []string {
    return []string{}
}

func extractHostname(rawurl string) string {
    parsedUrl, _ := url.Parse(rawurl)
    return parsedUrl.Host
}

func crawl(startUrl string, htmlParser HtmlParser) []string {
    hostname := extractHostname(startUrl)
    visited := sync.Map{}
    var wg sync.WaitGroup
    queue := make(chan string, 100)
    visited.Store(startUrl, true)
    queue <- startUrl

    worker := func() {
        for url := range queue {
            for _, nextUrl := range htmlParser.GetUrls(url) {
                if extractHostname(nextUrl) == hostname {
                    if _, loaded := visited.LoadOrStore(nextUrl, true); !loaded {
                        wg.Add(1)
                        queue <- nextUrl
                    }
                }
            }
            wg.Done()
        }
    }

    for i := 0; i < 10; i++ {
        go worker()
    }
    wg.Add(1)
    wg.Wait()
    close(queue)

    var result []string
    visited.Range(func(key, value interface{}) bool {
        result = append(result, key.(string))
        return true
    })
    return result
}

func main() {}

Algorithm

Извлечь имя хоста из startUrl.

Использовать многопоточность для обработки URL-адресов.

Хранить посещенные URL-адреса, чтобы избежать повторного посещения.

Использовать HtmlParser для получения URL-адресов с веб-страниц.

😎

Stellen zu dieser Aufgabe

aktive Stellen with overlapping task tags are angezeigt.

Alle Stellen

Es gibt noch keine aktiven Stellen.