Skip to content
This repository has been archived by the owner on Mar 23, 2024. It is now read-only.

Feat/#42 #47

Merged
merged 7 commits into from
Feb 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Epub/KoeBook.Epub/ScrapingAozora.cs
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath,
var response = await downloading.Task.ConfigureAwait(false);
using var ms = new MemoryStream();
await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false);
var filePass = imageDirectory + FileUrlToFileName().Replace(img.Source, "$1");
var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(img.Source, "$1"));
File.WriteAllBytes(filePass, ms.ToArray());
checkSection(document, chapterNum);
if (document.Chapters[chapterNum].Sections[sectionNum].Elements.Count > 1)
Expand Down
298 changes: 298 additions & 0 deletions Epub/KoeBook.Epub/ScrapingNarou.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
using AngleSharp;
using AngleSharp.Dom;
using AngleSharp.Html.Dom;
using AngleSharp.Io;
using KoeBook.Epub.Service;
using System.IO;
using System.Net.Http.Json;
using static KoeBook.Epub.ScrapingHelper;

namespace KoeBook.Epub
{
public partial class ScrapingNarouService : IScrapingService
{
public ScrapingNarouService(IHttpClientFactory httpClientFactory)
{
_httpCliantFactory = httpClientFactory;
}

private readonly IHttpClientFactory _httpCliantFactory;

public async Task<EpubDocument> ScrapingAsync(string url, string coverFilePath, string imageDirectory, Guid id, CancellationToken ct)
{
var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);

// title の取得
var bookTitle = doc.QuerySelector(".novel_title");
if (bookTitle is null)
{
throw new EpubDocumentException($"Failed to get title properly.\nUrl may be not collect");
}

// auther の取得
var bookAuther = doc.QuerySelector(".novel_writername a");
if (bookAuther is null)
{
throw new EpubDocumentException($"Failed to get auther properly.\nUrl may be not collect");
}

bool isRensai = true;
int allNum = 0;

var apiUrl = $"https://api.syosetu.com/novelapi/api/?of=ga-nt&ncode={UrlToNcode().Replace(url, "$1")}&out=json";

// APIを利用して、noveltype : 連載(1)か短編(2)か、general_all_no : 全掲載部分数
var message = new HttpRequestMessage(System.Net.Http.HttpMethod.Get, apiUrl);
message.Headers.UserAgent.ParseAdd("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36");
var client = _httpCliantFactory.CreateClient();
var result = await client.SendAsync(message, ct).ConfigureAwait(false);
var test = await result.Content.ReadAsStringAsync(ct).ConfigureAwait(false);
if (result.IsSuccessStatusCode)
{
var content = await result.Content.ReadFromJsonAsync<BookInfo[]>(ct).ConfigureAwait(false);
if (content != null)
{
if (content[1].noveltype != null)
{
if (content[1].noveltype == 2)
{
isRensai = false;
}
}
else
{
throw new EpubDocumentException("faild to get data by Narou API");
}
if (content[1].general_all_no != null)
{
allNum = (int)content[1].general_all_no!;
}
if (allNum == 0)
{
throw new EpubDocumentException("faild to get data by Narou API");
}
}
}
else
{
throw new EpubDocumentException("Url may be not Correct");
}

var document = new EpubDocument(bookTitle.InnerHtml, bookAuther.InnerHtml, coverFilePath, id);
if (isRensai)
{
List<SectionWithChapterTitle> SectionWithChapterTitleList = new List<SectionWithChapterTitle>();
for (int i = 1; i <= allNum; i++)
{
Console.WriteLine(i);
await Task.Delay(500);
var pageUrl = System.IO.Path.Combine(url, i.ToString());
var load = await ReadPageAsync(pageUrl, isRensai, imageDirectory, ct).ConfigureAwait(false);
SectionWithChapterTitleList.Add(load);
}
string? chapterTitle = null;
foreach (var sectionWithChapterTitle in SectionWithChapterTitleList)
{
if (sectionWithChapterTitle != null)
{
if (sectionWithChapterTitle.title != null)
{
if (sectionWithChapterTitle.title != chapterTitle)
{
chapterTitle = sectionWithChapterTitle.title;
document.Chapters.Add(new Chapter() { Title = chapterTitle });
document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section);
}
else
{
document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section);
}
}
else
{
if (document.Chapters.Count == 0)
{
document.Chapters.Add(new Chapter());
}
document.Chapters[^1].Sections.Add(sectionWithChapterTitle.section);
}
}
else
{
throw new EpubDocumentException("failed to get page");
}
}
}
else
{
var load = await ReadPageAsync(url, isRensai, imageDirectory, ct).ConfigureAwait(false);
if (load != null)
{
document.Chapters.Add(new Chapter() { Title = null });
document.Chapters[^1].Sections.Add(load.section);
}
}
return document;
}

public record BookInfo(int? allcount, int? noveltype, int? general_all_no);

private record SectionWithChapterTitle(string? title, Section section);

private async Task<SectionWithChapterTitle> ReadPageAsync(string url, bool isRensai, string imageDirectory, CancellationToken ct)
{
var config = Configuration.Default.WithDefaultLoader();
using var context = BrowsingContext.New(config);
var doc = await context.OpenAsync(url, ct).ConfigureAwait(false);

var chapterTitleElement = doc.QuerySelector(".chapter_title");
string? chapterTitle = null;
if (chapterTitleElement != null)
{
if (chapterTitleElement.InnerHtml != null)
{
chapterTitle = chapterTitleElement.InnerHtml;
}
}

IElement? sectionTitleElement = null;
if (isRensai)
{
sectionTitleElement = doc.QuerySelector(".novel_subtitle");
}
else
{
sectionTitleElement = doc.QuerySelector(".novel_title");
}

string sectionTitle = "";
if (sectionTitleElement == null)
{
throw new EpubDocumentException("Can not find title of page");
}
else
{
sectionTitle = sectionTitleElement.InnerHtml;
}


var section = new Section(sectionTitleElement.InnerHtml);


var main_text = doc.QuerySelector("#novel_honbun");
if (main_text != null)
{
foreach (var item in main_text.Children)
{
if (item is IHtmlParagraphElement)
{
if (item.ChildElementCount == 0)
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
}
else if (item.ChildElementCount == 1)
{
if (item.Children[0] is IHtmlAnchorElement aElement)
{
if (aElement.ChildElementCount == 1)
{
if (aElement.Children[0] is IHtmlImageElement img)
{
if (img.Source != null)
{
// 画像のダウンロード
var loader = context.GetService<IDocumentLoader>();
if (loader != null)
{
var downloading = loader.FetchAsync(new DocumentRequest(new Url(img.Source)));
ct.Register(() => downloading.Cancel());
var response = await downloading.Task.ConfigureAwait(false);
using var ms = new MemoryStream();
await response.Content.CopyToAsync(ms, ct).ConfigureAwait(false);
var filePass = System.IO.Path.Combine(imageDirectory, FileUrlToFileName().Replace(response.Address.Href, "$1"));
File.WriteAllBytes(filePass, ms.ToArray());
section.Elements.Add(new Picture(filePass));
}
}
else
{
throw new EpubDocumentException("Unexpected structure");
}
}
}
else
{
throw new EpubDocumentException("Unexpected structure");
}
}
else if (item.Children[0].TagName == "RUBY")
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
}
else if (item.Children[0] is not IHtmlBreakRowElement)
{
throw new EpubDocumentException("Unexpected structure");
}
}
else
{
bool isAllRuby = true;
foreach (var tags in item.Children)
{
if (tags.TagName != "RUBY")
{
isAllRuby = false;
}
}
if (isAllRuby)
{
if (!string.IsNullOrWhiteSpace(item.InnerHtml))
{
foreach (var split in SplitBrace(item.InnerHtml))
{
section.Elements.Add(new Paragraph() { Text = split });
}
}
}
else
{
throw new EpubDocumentException("Unexpected structure");
}
}
}
else
{
throw new EpubDocumentException("Unexpected structure");
}
}
}
else
{
throw new EpubDocumentException("There is no honbun.");
}
return new SectionWithChapterTitle(chapterTitle, section);
}



[System.Text.RegularExpressions.GeneratedRegex(@"https://.{5,7}.syosetu.com/(.{7}).?")]
private static partial System.Text.RegularExpressions.Regex UrlToNcode();

[System.Text.RegularExpressions.GeneratedRegex(@"http.{1,}/([^/]{0,}\.[^/]{1,})")]
private static partial System.Text.RegularExpressions.Regex FileUrlToFileName();
}
}
Loading