Skip to content
This repository has been archived by the owner on Aug 14, 2023. It is now read-only.

Commit

Permalink
Enhanced the Crawler.kt using Kotlin Coroutine Channels
Browse files Browse the repository at this point in the history
  • Loading branch information
yamin8000 committed Apr 19, 2023
1 parent 44f9c93 commit 331448d
Show file tree
Hide file tree
Showing 7 changed files with 124 additions and 57 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,10 @@ Alternatively, you can download the jar file from the releases section or build
## Features

- Poor man's Crawler
- Search with trigger
- User tweets limit
- Depth limit
- User info
- More features (WIP)

## License
Expand Down
2 changes: 1 addition & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ plugins {
}

group = "io.github.yamin8000"
version = "1.0.0"
version = "1.0.1"

repositories {
mavenCentral()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ object ConsoleHelper {
*/
fun readBoolean(message: String? = null): Boolean {
return try {
if (message != null) t.println(askStyle(message))
if (message != null) t.println(askStyle("${message}(y/n)"))
readCleanLine().lowercase(Locale.getDefault()) in affirmatives
} catch (exception: Exception) {
false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,94 +9,111 @@ import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.resultStyle
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.t
import io.github.yamin8000.twitterscrapper.helpers.ConsoleHelper.warningStyle
import io.github.yamin8000.twitterscrapper.util.Constants
import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_CRAWL_DEPTH_LIMIT
import io.github.yamin8000.twitterscrapper.util.Constants.DEFAULT_CRAWL_TWEETS_LIMIT
import io.github.yamin8000.twitterscrapper.util.Constants.PROTECTED_ACCOUNT
import io.github.yamin8000.twitterscrapper.util.KTree
import io.github.yamin8000.twitterscrapper.util.Utility.csvOf
import io.github.yamin8000.twitterscrapper.web.retryingGet
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.Channel
import org.jsoup.Jsoup
import org.jsoup.select.Elements
import java.io.File

class Crawler(
private val isNested: Boolean = true,
isNested: Boolean = true
) {
private val scope = CoroutineScope(Dispatchers.IO)

private var startingUsers = listOf<String>()
private val channel = Channel<Job>(4)

private val startingUsers: List<String>

private var tweetCountLimit = DEFAULT_CRAWL_TWEETS_LIMIT

private var triggers: List<String> = listOf()
private var triggers = listOf<String>()

private var root: KTree<String>? = null

private var depthLimit = DEFAULT_CRAWL_DEPTH_LIMIT

init {
startingUsers = readMultipleStrings("Starting user").map { it.sanitizeUser() }
if (readBoolean("Do you want to limit the number of tweets for each user?(y/n)")) {
tweetCountLimit = readInteger(
message = "Enter tweet limit for each user.",
range = 1..DEFAULT_CRAWL_TWEETS_LIMIT
)
if (readBoolean("Do you want to customize the crawler?")) {
if (readBoolean("Do you want to limit the number of tweets for each user?")) {
tweetCountLimit = readInteger(
message = "Enter tweet limit for each user.",
range = 1..DEFAULT_CRAWL_TWEETS_LIMIT
)
}
if (isNested) {
if (readBoolean("Do you want to specify crawl depth limit?"))
depthLimit = readInteger("Crawl depth limit")
} else depthLimit = 1
if (readBoolean("Do you want to filter tweets with Trigger words?"))
triggers = readMultipleStrings("Trigger word")
}
if (readBoolean("Do you want to filter tweets with Trigger words?(y/n)"))
triggers = readMultipleStrings("Trigger word")
}

@OptIn(ExperimentalCoroutinesApi::class)
suspend fun crawl() {
buildList {
startingUsers.forEach { user ->
add(scope.launch { singleUserCrawler(user) })
startingUsers.forEach { user ->
channel.send(singleUserCrawler(user))
}
while (true) {
if (channel.isEmpty) {
t.println(resultStyle("Crawler Stopped!"))
break
}
}.joinAll()
delay(5000)
}
}

private suspend fun singleUserCrawler(
username: String
) {
): Job = scope.launch {
if (root == null) root = KTree(username)

t.println(infoStyle("Crawling: ") + resultStyle(username))

if (!File("${Constants.DOWNLOAD_PATH}/$username.txt").exists()) {
var tweetCount = 0
try {
crawlUsername(username) { elements ->
t.println(infoStyle("New results for $username"))
val tweets = getSingles(elements)
val tweetsWithTriggers = mutableListOf<String>()
triggers.forEach { trigger ->
tweetsWithTriggers.addAll(tweets.filter { it.contains(trigger) })
}

var newTweets = tweets
var newTweetsCount = newTweets.size
if (triggers.isNotEmpty()) {
newTweets = tweetsWithTriggers
newTweetsCount = tweetsWithTriggers.size
}
val (tweets, friends) = crawlUsername(username, tweetCountLimit)
t.println(infoStyle("New results for $username"))
val tweetsWithTriggers = mutableListOf<String>()
triggers.forEach { trigger ->
tweetsWithTriggers.addAll(tweets.filter { it.contains(trigger) })
}

if (newTweets.isNotEmpty())
saveUserPosts(username, newTweets.take(tweetCountLimit).toSet())
else t.println(warningStyle("Empty tweets for $username"))
val newTweets = if (triggers.isNotEmpty()) tweetsWithTriggers else tweets

tweetCount += newTweetsCount
if (tweetCount >= tweetCountLimit) throw Exception("Tweet count limit reached for $username")
val node = root?.findDescendant(username) ?: root
t.println(infoStyle("$username, Tree level: ${node?.level}"))
if (tweets.isNotEmpty() && node != null && node.level <= depthLimit) {
if (newTweets.isNotEmpty())
saveUserPosts(username, newTweets.take(tweetCountLimit).toSet())

val friends = fetchNewUsers(elements.html())
.map { it.sanitizeUser() }
.filter { it != username }
if (isNested)
friends.forEach { scope.launch { singleUserCrawler(it) } }
friends.forEach {
node.addChild(it)
}
} catch (e: Exception) {
t.println(errorStyle(e.message ?: ""))
}
if (depthLimit >= 1) {
node.children().filter { it.level <= depthLimit }.forEach {
channel.send(singleUserCrawler(it.data))
}
}
} else t.println(warningStyle("Empty tweets for $username"))
} else t.println(warningStyle("$username is already being crawled"))
channel.receive()
}

private suspend fun crawlUsername(
username: String,
onNewElements: (Elements) -> Unit
) {
limit: Int
): Pair<List<String>, List<String>> {
var cursor: String? = ""
var html: String
val tweets = mutableSetOf<String>()
val friends = mutableSetOf<String>()
do {
html = withContext(scope.coroutineContext) { retryingGet("$username?cursor=$cursor")?.body?.string() ?: "" }
if (html.contains(PROTECTED_ACCOUNT)) {
Expand All @@ -113,8 +130,15 @@ class Crawler(
?.attr("href")
?.split('=')
?.last()
onNewElements(doc.allElements)
tweets.addAll(getSingles(doc.allElements))
friends.addAll(
fetchNewUsers(html)
.map { it.sanitizeUser() }
.filter { it != username }
)
if (tweets.take(limit).size >= limit) break
} while (cursor != null)
return tweets.take(limit) to friends.toList()
}

private fun getSingles(
Expand All @@ -130,15 +154,9 @@ class Crawler(
t.println(infoStyle("Saving $username tweets"))
val file = File("${Constants.DOWNLOAD_PATH}/$username.txt")

var bias = 0
var headers: List<String>? = listOf("#", "tweet")
if (file.exists()) {
bias = file.readText().split("\n").size
headers = null
}
val headers = listOf("#", "tweet")

val csv = csvOf(
indexBias = bias,
headers = headers,
data = tweets,
itemBuilder = { index, item ->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class CrawlerModule : BaseModule(Menus.crawlerMenu) {
) {
val crawler = Crawler(isNested)
runBlocking {
withContext(Dispatchers.Default) {
withContext(Dispatchers.Unconfined) {
crawler.crawl()
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ object Constants {
var DOWNLOAD_PATH = "c:\\TwitterScrapper"

const val DEFAULT_CRAWL_TWEETS_LIMIT = 500
const val DEFAULT_CRAWL_DEPTH_LIMIT = 3

val FAILED_REQUEST_DELAY = 50L..500L

Expand Down
44 changes: 44 additions & 0 deletions src/main/kotlin/io/github/yamin8000/twitterscrapper/util/KTree.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package io.github.yamin8000.twitterscrapper.util

/**
 * A minimal generic tree node. Each instance is simultaneously a node and a
 * handle to the subtree rooted at it. Nodes are linked by [addChild]; the
 * structure is not thread-safe.
 */
class KTree<T>(root: T) {
    /** Parent node, or null when this node is the root. Assigned by [addChild]. */
    var parent: KTree<T>? = null

    /** Payload stored at this node. */
    var data: T = root

    /** True when this node has no parent, i.e. it is the tree's root. */
    val isRoot: Boolean
        get() = parent == null

    /** True when this node has no direct children. */
    val isLeaf: Boolean
        get() = directChildren.isEmpty()

    /** Depth below the root: the root is 0, its children 1, and so on. */
    val level: Int
        get() = if (isRoot) 0 else (parent?.level ?: 0) + 1

    private var directChildren = mutableListOf<KTree<T>>()

    // Every node strictly below this one (children, grandchildren, ...),
    // kept in sync by addChild/addDescendant.
    private var descendants = mutableListOf<KTree<T>>()

    /** Immutable snapshot of this node's direct children. */
    fun children() = directChildren.toList()

    /**
     * Walks up the parent chain and returns the tree's root.
     *
     * Bug fix: the previous implementation recursed on `this.root()` instead of
     * the parent, so any non-root node overflowed the stack.
     */
    fun root(): KTree<T> = parent?.root() ?: this

    /**
     * Wraps [child] in a new node, attaches it under this node, registers it as
     * a descendant of this node and of every ancestor, and returns the new node.
     */
    fun addChild(child: T): KTree<T> {
        val childNode = KTree(child)
        childNode.parent = this
        directChildren.add(childNode)
        addDescendant(childNode)
        return childNode
    }

    // Records [descendant] here and propagates the registration up the whole
    // ancestor chain. Bug fix: the previous version registered only on the
    // immediate parent, so findDescendant() from the root could not see nodes
    // more than two levels down.
    private fun addDescendant(descendant: KTree<T>) {
        descendants.add(descendant)
        parent?.addDescendant(descendant)
    }

    /** Direct child whose [data] equals [node], or null if none. */
    fun findChild(node: T) = directChildren.find { it.data == node }

    /** Descendant at any depth whose [data] equals [node], or null if none. */
    fun findDescendant(node: T) = findChild(node) ?: descendants.find { it.data == node }

    override fun toString() = data.toString()
}

0 comments on commit 331448d

Please sign in to comment.